<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 17, 2013 at 7:17 AM, <span dir="ltr">&lt;<a href="mailto:yuvaraj@multicorewareinc.com" target="_blank">yuvaraj@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;<br>
# Date 1382012201 -19800<br>
# Thu Oct 17 17:46:41 2013 +0530<br>
# Node ID 77f60b1e4441ab947f75291eadf199d2f3ad1057<br>
# Parent fc9dbd798ac37ec1acc0596aa179f0deb586c092<br>
pixel16: converted sad_4 from vector class to intrinsic<br></blockquote><div><br></div><div>Queued, but this is not where you should be spending your time.</div><div><br></div><div>We have an urgent need to convert the remaining vector class 8bpp primitives (HIGH_BIT_DEPTH=0) to intrinsics, and we are almost done. The only ones left are in intra-sse3.cpp, blockcopy-avx2.cpp, and pixel-avx2.cpp.</div>
<div><br></div><div>Of those, the intra DC and planar primitives have the highest priority, followed by AVX2, followed by the intra-angular functions.</div><div><br></div><div>I am very tempted to just delete the 8bpp vector class intra-angular functions and use the C references until we generate assembly for those, because I think their general concept needs to be redesigned.</div>
<div><br></div><div>The HIGH_BIT_DEPTH=1 primitives will all go directly to assembly code because we are in no rush for those.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
diff -r fc9dbd798ac3 -r 77f60b1e4441 source/common/vec/pixel16-sse41.cpp<br>
--- a/source/common/vec/pixel16-sse41.cpp Thu Oct 17 14:14:40 2013 +0530<br>
+++ b/source/common/vec/pixel16-sse41.cpp Thu Oct 17 17:46:41 2013 +0530<br>
@@ -41,42 +41,38 @@<br>
template&lt;int ly&gt;<br>
int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)<br>
{<br>
- Vec8s m1, n1;<br>
+ __m128i sum1 = _mm_setzero_si128();<br>
+ __m128i T00, T01, T02, T03;<br>
+ __m128i T10, T11, T12, T13;<br>
+ __m128i T20, T21;<br>
<br>
- Vec4i sum(0);<br>
- Vec8us sad(0);<br>
- int max_iterators = (ly >> 4) << 4;<br>
- int row;<br>
+ for (int i = 0; i < ly; i += 4)<br>
+ {<br>
+ T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * fencstride));<br>
+ T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 1) * fencstride));<br>
+ T01 = _mm_unpacklo_epi64(T00, T01);<br>
+ T02 = _mm_loadl_epi64((__m128i*)(fenc + (i + 2) * fencstride));<br>
+ T03 = _mm_loadl_epi64((__m128i*)(fenc + (i + 3) * fencstride));<br>
+ T03 = _mm_unpacklo_epi64(T02, T03);<br>
<br>
- for (row = 0; row < max_iterators; row += 16)<br>
- {<br>
- for (int i = 0; i < 16; i++)<br>
- {<br>
- m1.load_a(fenc);<br>
- n1.load(fref);<br>
- sad += abs(m1 - n1);<br>
+ T10 = _mm_loadl_epi64((__m128i*)(fref + (i + 0) * frefstride));<br>
+ T11 = _mm_loadl_epi64((__m128i*)(fref + (i + 1) * frefstride));<br>
+ T11 = _mm_unpacklo_epi64(T10, T11);<br>
+ T12 = _mm_loadl_epi64((__m128i*)(fref + (i + 2) * frefstride));<br>
+ T13 = _mm_loadl_epi64((__m128i*)(fref + (i + 3) * frefstride));<br>
+ T13 = _mm_unpacklo_epi64(T12, T13);<br>
+ T20 = _mm_sub_epi16(T01, T11);<br>
+ T20 = _mm_abs_epi16(T20);<br>
+ T21 = _mm_sub_epi16(T03, T13);<br>
+ T21 = _mm_abs_epi16(T21);<br>
+ T21 = _mm_add_epi16(T20, T21);<br>
+ sum1 = _mm_add_epi16(sum1, T21);<br>
+ }<br>
+ sum1 = _mm_hadd_epi16(sum1, sum1);<br>
+ sum1 = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());<br>
+ sum1 = _mm_hadd_epi32(_mm_hadd_epi32(sum1, sum1), sum1);<br>
<br>
- fenc += fencstride;<br>
- fref += frefstride;<br>
- }<br>
-<br>
- sum += extend_low(sad);<br>
- sad = 0;<br>
- }<br>
-<br>
- while (row++ < ly)<br>
- {<br>
- m1.load_a(fenc);<br>
- n1.load(fref);<br>
- sad += abs(m1 - n1);<br>
-<br>
- fenc += fencstride;<br>
- fref += frefstride;<br>
- }<br>
-<br>
- sum += extend_low(sad);<br>
-<br>
- return horizontal_add(sum);<br>
+ return _mm_cvtsi128_si32(sum1);<br>
}<br>
<br>
template&lt;int ly&gt;<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>