<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 10, 2013 at 8:02 AM,  <span dir="ltr">&lt;<a href="mailto:yuvaraj@multicorewareinc.com" target="_blank">yuvaraj@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;<br>
# Date 1381410080 -19800<br>
#      Thu Oct 10 18:31:20 2013 +0530<br>
# Node ID d43f21a8128f02c97aafde351162d00977f664b6<br>
# Parent  840229ed3794569f5e15d84289531c829b75dcd6<br>
dct: replaced partialButterfly8 vector class function with intrinsic<br></blockquote><div><br></div><div>This new function uses pmulld (_mm_mullo_epi32), which is an SSE4.1 instruction, so it needs to be moved out of dct-sse3.cpp into an SSE4.1 source file.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

<br>
diff -r 840229ed3794 -r d43f21a8128f source/common/vec/dct-sse3.cpp<br>
--- a/source/common/vec/dct-sse3.cpp    Thu Oct 10 16:57:47 2013 +0530<br>
+++ b/source/common/vec/dct-sse3.cpp    Thu Oct 10 18:31:20 2013 +0530<br>
@@ -246,47 +246,48 @@<br>
     int j;<br>
     int add = 1 << (shift - 1);<br>
<br>
-    Vec4i zero_row(64, 64, 0, 0);<br>
-    Vec4i four_row(64, -64, 0, 0);<br>
-    Vec4i two_row(83, 36, 0, 0);<br>
-    Vec4i six_row(36, -83, 0, 0);<br>
+    __m128i zero_row = _mm_setr_epi32(64, 64, 0, 0);<br>
+    __m128i four_row = _mm_setr_epi32(64, -64, 0, 0);<br>
+    __m128i two_row = _mm_setr_epi32(83, 36, 0, 0);<br>
+    __m128i six_row = _mm_setr_epi32(36, -83, 0, 0);<br>
<br>
-    Vec4i one_row(89, 75, 50, 18);<br>
-    Vec4i three_row(75, -18, -89, -50);<br>
-    Vec4i five_row(50, -89, 18, 75);<br>
-    Vec4i seven_row(18, -50, 75, -89);<br>
+    __m128i one_row = _mm_setr_epi32(89, 75, 50, 18);<br>
+    __m128i three_row = _mm_setr_epi32(75, -18, -89, -50);<br>
+    __m128i five_row = _mm_setr_epi32(50, -89, 18, 75);<br>
+    __m128i seven_row = _mm_setr_epi32(18, -50, 75, -89);<br>
<br>
     for (j = 0; j < line; j++)<br>
     {<br>
-        Vec8s srcTmp;<br>
-        srcTmp.load(src);<br>
+        __m128i srcTmp;<br>
+        srcTmp = _mm_loadu_si128((__m128i*)(src));<br>
<br>
-        Vec4i E_first_half = extend_low(srcTmp);<br>
-        Vec4i E_second_half = extend_high(srcTmp);<br>
-        E_second_half = permute4i<3, 2, 1, 0>(E_second_half);<br>
+        __m128i sign = _mm_srai_epi16(srcTmp, 15);<br>
+        __m128i E_first_half = _mm_unpacklo_epi16(srcTmp, sign);<br>
+        __m128i E_second_half = _mm_unpackhi_epi16(srcTmp, sign);<br>
+        E_second_half = _mm_shuffle_epi32(E_second_half, 27);<br>
<br>
-        Vec4i E = E_first_half + E_second_half;<br>
-        Vec4i O = E_first_half - E_second_half;<br>
+        __m128i E = _mm_add_epi32(E_first_half, E_second_half);<br>
+        __m128i O = _mm_sub_epi32(E_first_half, E_second_half);<br>
<br>
-        Vec4i EE_first_half = permute4i<0, 1, -1, -1>(E);<br>
-        Vec4i EE_second_half = permute4i<3, 2, -1, -1>(E);<br>
-        Vec4i EE = EE_first_half + EE_second_half;<br>
-        Vec4i EO = EE_first_half - EE_second_half;<br>
+        __m128i EE_first_half = _mm_shuffle_epi32(E, 4);<br>
+        __m128i EE_second_half = _mm_shuffle_epi32(E, 11);<br>
+        __m128i EE = _mm_add_epi32(EE_first_half, EE_second_half);<br>
+        __m128i EO = _mm_sub_epi32(EE_first_half, EE_second_half);<br>
<br>
-        int dst0 = ((horizontal_add(zero_row * EE)) + add) >> shift;<br>
-        int dst4 = ((horizontal_add(four_row * EE)) + add) >> shift;<br>
-        int dst2 = ((horizontal_add(two_row * EO)) + add) >> shift;<br>
-        int dst6 = ((horizontal_add(six_row * EO)) + add) >> shift;<br>
+        int dst0 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(zero_row, EE), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst4 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(four_row, EE), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst2 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(two_row, EO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst6 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(six_row, EO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
         dst[0] = dst0;<br>
         dst[4 * line] = dst4;<br>
         dst[2 * line] = dst2;<br>
         dst[6 * line] = dst6;<br>
<br>
-        int dst1 = ((horizontal_add(one_row * O)) + add) >> shift;<br>
-        int dst3 = ((horizontal_add(three_row * O)) + add) >> shift;<br>
-        int dst5 = ((horizontal_add(five_row * O)) + add) >> shift;<br>
-        int dst7 = ((horizontal_add(seven_row * O)) + add) >> shift;<br>
+        int dst1 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(one_row, O), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst3 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(three_row, O), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst5 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(five_row, O), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+        int dst7 = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(seven_row, O), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
         dst[line] = dst1;<br>
         dst[3 * line] = dst3;<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>