<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 10, 2013 at 2:33 AM,  <span dir="ltr">&lt;<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar Gorade &lt;<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>&gt;<br>
# Date 1381390085 -19800<br>
#      Thu Oct 10 12:58:05 2013 +0530<br>
# Node ID ad1822b8e451ec9de4a8d679e9dee5d0c2b8fa8d<br>
# Parent  49230a47306bd4a4b7c696a7e723d664755a92d7<br>
ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function with intrinsic.<br></blockquote><div><br></div><div>Will push in a moment; beware I've just re-ordered this file to put the last vector class method together with the vector class include.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
diff -r 49230a47306b -r ad1822b8e451 source/common/vec/ipfilter-ssse3.cpp<br>
--- a/source/common/vec/ipfilter-ssse3.cpp      Thu Oct 10 11:59:16 2013 +0530<br>
+++ b/source/common/vec/ipfilter-ssse3.cpp      Thu Oct 10 12:58:05 2013 +0530<br>
@@ -199,50 +199,102 @@<br>
     }<br>
 }<br>
<br>
-void filterConvertShortToPel(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)<br>
+void filterConvertShortToPel(short *source, intptr_t sourceStride, pixel *dest, intptr_t destStride, int width, int height)<br>
 {<br>
-    short* srcOrg = src;<br>
-    pixel* dstOrg = dst;<br>
+    short* src = source;<br>
+    pixel* dst = dest;<br>
     int shift = IF_INTERNAL_PREC - X265_DEPTH;<br>
     short offset = IF_INTERNAL_OFFS;<br>
+    offset += shift ? (1 << (shift - 1)) : 0;<br>
+    short maxval = (1 << X265_DEPTH) - 1;<br>
+    int row, col;<br>
<br>
-    offset += shift ? (1 << (shift - 1)) : 0;<br>
-    short maxVal = (1 << X265_DEPTH) - 1;<br>
-    Vec8s minVal(0);<br>
-    int row, col;<br>
-    Vec8s src_c, val_c, val_zero(0);<br>
-    Vec16uc val_uc;<br>
+    __m128i minval  = _mm_setzero_si128();<br>
+    __m128i zeroval = _mm_setzero_si128();<br>
+    __m128i val1, val2, val3;<br>
+<br>
     for (row = 0; row < height; row++)<br>
     {<br>
         for (col = 0; col < width - 7; col += 8)<br>
         {<br>
-            src_c.load(src + col);<br>
-            val_c = add_saturated(src_c, offset) >> shift;<br>
-            val_c = max(val_c, minVal);<br>
-            val_c = min(val_c, maxVal);<br>
-            val_uc = compress(val_c, val_zero);<br>
-            val_uc.store_partial(8, dst + col);<br>
+            val1 = _mm_loadu_si128((__m128i const*)(source + col));<br>
+            val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));<br>
+            val2 = _mm_max_epi16(val2, minval);<br>
+            val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));<br>
+<br>
+            __m128i mask  = _mm_set1_epi32(0x00FF00FF);           // mask for low bytes<br>
+            __m128i lowm  = _mm_and_si128(val2, mask);            // bytes of low<br>
+            __m128i highm = _mm_and_si128(zeroval, mask);         // bytes of high<br>
+            val3 = _mm_packus_epi16(lowm, highm);                 // unsigned pack<br>
+<br>
+            union<br>
+            {<br>
+                int8_t  c[16];<br>
+                int64_t q[2];<br>
+            } u;<br>
+            _mm_storeu_si128((__m128i*)u.c, val3);<br>
+            *(int64_t*)(dest + col) = u.q[0];<br>
         }<br>
-<br>
-        src += srcStride;<br>
-        dst += dstStride;<br>
+        source += sourceStride;<br>
+        dest += destStride;<br>
     }<br>
<br>
     if (width % 8 != 0)<br>
     {<br>
-        src = srcOrg;<br>
-        dst = dstOrg;<br>
+        source = src;<br>
+        dest = dst;<br>
         col = width - (width % 8);<br>
         for (row = 0; row < height; row++)<br>
         {<br>
-            src_c.load(src + col);<br>
-            val_c = add_saturated(src_c, offset) >> shift;<br>
-            val_c = max(val_c, minVal);<br>
-            val_c = min(val_c, maxVal);<br>
-            val_uc = compress(val_c, val_zero);<br>
-            val_uc.store_partial(width - col, dst + col);<br>
-            src += srcStride;<br>
-            dst += dstStride;<br>
+            val1 = _mm_loadu_si128((__m128i const*)(source + col));<br>
+            val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));<br>
+            val2 = _mm_max_epi16(val2, minval);<br>
+            val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));<br>
+<br>
+            __m128i mask  = _mm_set1_epi32(0x00FF00FF);           // mask for low bytes<br>
+            __m128i lowm  = _mm_and_si128(val2, mask);            // bytes of low<br>
+            __m128i highm = _mm_and_si128(zeroval, mask);         // bytes of high<br>
+            val3 = _mm_packus_epi16(lowm, highm);                 // unsigned pack<br>
+<br>
+            int n = width - col;<br>
+            if (n >= 16)<br>
+            {<br>
+                _mm_storeu_si128((__m128i*)(dest + col), val3);<br>
+            }<br>
+            else if (n <= 0) ;    // do nothing if value of is n less than 0<br>
+            else<br>
+            {<br>
+                union<br>
+                {<br>
+                    int8_t  c[16];<br>
+                    int16_t s[8];<br>
+                    int32_t i[4];<br>
+                    int64_t q[2];<br>
+                } u;<br>
+                _mm_storeu_si128((__m128i*)u.c, val3);<br>
+                int j = 0;<br>
+                if (n & 8)    // n == (8,9,10,11,12,13,14,15)<br>
+                {<br>
+                    *(int64_t*)(dest + col) = u.q[0];<br>
+                    j += 8;<br>
+                }<br>
+                if (n & 4)    // n == (4,5,6,7,12,13,14,15)<br>
+                {<br>
+                    ((int32_t*)(dest + col))[j/4] = u.i[j/4];<br>
+                    j += 4;<br>
+                }<br>
+                if (n & 2)    // n == (2,3,6,7,10,11,14,15)<br>
+                {<br>
+                    ((int16_t*)(dest + col))[j/2] = u.s[j/2];<br>
+                    j += 2;<br>
+                }<br>
+                if (n & 1)    // n == (1,3,5,7,9,11,13,15)<br>
+                {<br>
+                    ((int8_t*)(dest + col))[j] = u.c[j];<br>
+                }<br>
+            }<br>
+            source += sourceStride;<br>
+            dest += destStride;<br>
         }<br>
     }<br>
 }<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>