<div dir="ltr"><br><br><div class="gmail_quote"><br>            for (int x = 0; x < bx; x += 16)<br>            {<br>

-                Vec16uc word0, word1;<br>

-                Vec8s word3, word4;<br>

-                word0.load_a(src0 + x);<br>

-                word1.load_a(src1 + x);<br>

-                word3 = extend_low(word0) - extend_low(word1);<br>

-                word4 = extend_high(word0) - extend_high(word1);<br>

-                word3.store_a(dst + x);<br>

-                word4.store_a(dst + x + 8);<br>

+                __m128i word0, word1;<br>

+                __m128i word3, word4;<br>

+                __m128i mask = _mm_setzero_si128();<br>

+<br>>>+                word0 = _mm_load_si128((__m128i const*)(src0 + x));    // load 16 bytes from src1<br>>>+                word1 = _mm_load_si128((__m128i const*)(src1 + x));    // load 16 bytes from src2</div>

<div class="gmail_quote"><br></div><div class="gmail_quote">Please pay attention to the variable names when writing comments: these should refer to src0 and src1, not src1 and src2.<br>

+<br>

+                word3 = _mm_unpacklo_epi8(word0, mask);    // interleave with zero extensions<br>

+                word4 = _mm_unpacklo_epi8(word1, mask);<br>

+                _mm_store_si128((__m128i*)&dst[x], _mm_subs_epi16(word3, word4));    // store block into dst<br>

+<br>

+                word3 = _mm_unpackhi_epi8(word0, mask);    // interleave with zero extensions<br>

+                word4 = _mm_unpackhi_epi8(word1, mask);<br>

+                _mm_store_si128((__m128i*)&dst[x + 8], _mm_subs_epi16(word3, word4));    // store block into dst<br>

             }<br>

<br></div><div class="gmail_quote">I think we should also try unrolling the loop for multiples of 8; that may give you some additional performance gain.</div><div class="gmail_quote"><br></div><div class="gmail_quote">Regards,<br>

Praveen</div></div>