<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 10, 2013 at 2:33 AM, <span dir="ltr"><<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar Gorade <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>
# Date 1381390085 -19800<br>
# Thu Oct 10 12:58:05 2013 +0530<br>
# Node ID ad1822b8e451ec9de4a8d679e9dee5d0c2b8fa8d<br>
# Parent 49230a47306bd4a4b7c696a7e723d664755a92d7<br>
ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function with intrinsic.<br></blockquote><div><br></div><div>Will push in a moment; beware I've just re-ordered this file to put the last vector class method together with the vector class include.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
diff -r 49230a47306b -r ad1822b8e451 source/common/vec/ipfilter-ssse3.cpp<br>
--- a/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 11:59:16 2013 +0530<br>
+++ b/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 12:58:05 2013 +0530<br>
@@ -199,50 +199,102 @@<br>
}<br>
}<br>
<br>
-void filterConvertShortToPel(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)<br>
+void filterConvertShortToPel(short *source, intptr_t sourceStride, pixel *dest, intptr_t destStride, int width, int height)<br>
{<br>
- short* srcOrg = src;<br>
- pixel* dstOrg = dst;<br>
+ short* src = source;<br>
+ pixel* dst = dest;<br>
int shift = IF_INTERNAL_PREC - X265_DEPTH;<br>
short offset = IF_INTERNAL_OFFS;<br>
+ offset += shift ? (1 << (shift - 1)) : 0;<br>
+ short maxval = (1 << X265_DEPTH) - 1;<br>
+ int row, col;<br>
<br>
- offset += shift ? (1 << (shift - 1)) : 0;<br>
- short maxVal = (1 << X265_DEPTH) - 1;<br>
- Vec8s minVal(0);<br>
- int row, col;<br>
- Vec8s src_c, val_c, val_zero(0);<br>
- Vec16uc val_uc;<br>
+ __m128i minval = _mm_setzero_si128();<br>
+ __m128i zeroval = _mm_setzero_si128();<br>
+ __m128i val1, val2, val3;<br>
+<br>
for (row = 0; row < height; row++)<br>
{<br>
for (col = 0; col < width - 7; col += 8)<br>
{<br>
- src_c.load(src + col);<br>
- val_c = add_saturated(src_c, offset) >> shift;<br>
- val_c = max(val_c, minVal);<br>
- val_c = min(val_c, maxVal);<br>
- val_uc = compress(val_c, val_zero);<br>
- val_uc.store_partial(8, dst + col);<br>
+ val1 = _mm_loadu_si128((__m128i const*)(source + col));<br>
+ val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));<br>
+ val2 = _mm_max_epi16(val2, minval);<br>
+ val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));<br>
+<br>
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes<br>
+ __m128i lowm = _mm_and_si128(val2, mask); // bytes of low<br>
+ __m128i highm = _mm_and_si128(zeroval, mask); // bytes of high<br>
+ val3 = _mm_packus_epi16(lowm, highm); // unsigned pack<br>
+<br>
+ union<br>
+ {<br>
+ int8_t c[16];<br>
+ int64_t q[2];<br>
+ } u;<br>
+ _mm_storeu_si128((__m128i*)u.c, val3);<br>
+ *(int64_t*)(dest + col) = u.q[0];<br>
}<br>
-<br>
- src += srcStride;<br>
- dst += dstStride;<br>
+ source += sourceStride;<br>
+ dest += destStride;<br>
}<br>
<br>
if (width % 8 != 0)<br>
{<br>
- src = srcOrg;<br>
- dst = dstOrg;<br>
+ source = src;<br>
+ dest = dst;<br>
col = width - (width % 8);<br>
for (row = 0; row < height; row++)<br>
{<br>
- src_c.load(src + col);<br>
- val_c = add_saturated(src_c, offset) >> shift;<br>
- val_c = max(val_c, minVal);<br>
- val_c = min(val_c, maxVal);<br>
- val_uc = compress(val_c, val_zero);<br>
- val_uc.store_partial(width - col, dst + col);<br>
- src += srcStride;<br>
- dst += dstStride;<br>
+ val1 = _mm_loadu_si128((__m128i const*)(source + col));<br>
+ val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift));<br>
+ val2 = _mm_max_epi16(val2, minval);<br>
+ val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval));<br>
+<br>
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes<br>
+ __m128i lowm = _mm_and_si128(val2, mask); // bytes of low<br>
+ __m128i highm = _mm_and_si128(zeroval, mask); // bytes of high<br>
+ val3 = _mm_packus_epi16(lowm, highm); // unsigned pack<br>
+<br>
+ int n = width - col;<br>
+ if (n >= 16)<br>
+ {<br>
+ _mm_storeu_si128((__m128i*)(dest + col), val3);<br>
+ }<br>
+ else if (n <= 0) ; // do nothing if n is zero or negative<br>
+ else<br>
+ {<br>
+ union<br>
+ {<br>
+ int8_t c[16];<br>
+ int16_t s[8];<br>
+ int32_t i[4];<br>
+ int64_t q[2];<br>
+ } u;<br>
+ _mm_storeu_si128((__m128i*)u.c, val3);<br>
+ int j = 0;<br>
+ if (n & 8) // n == (8,9,10,11,12,13,14,15)<br>
+ {<br>
+ *(int64_t*)(dest + col) = u.q[0];<br>
+ j += 8;<br>
+ }<br>
+ if (n & 4) // n == (4,5,6,7,12,13,14,15)<br>
+ {<br>
+ ((int32_t*)(dest + col))[j/4] = u.i[j/4];<br>
+ j += 4;<br>
+ }<br>
+ if (n & 2) // n == (2,3,6,7,10,11,14,15)<br>
+ {<br>
+ ((int16_t*)(dest + col))[j/2] = u.s[j/2];<br>
+ j += 2;<br>
+ }<br>
+ if (n & 1) // n == (1,3,5,7,9,11,13,15)<br>
+ {<br>
+ ((int8_t*)(dest + col))[j] = u.c[j];<br>
+ }<br>
+ }<br>
+ source += sourceStride;<br>
+ dest += destStride;<br>
}<br>
}<br>
}<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>