<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Oct 11, 2013 at 3:40 AM,  <span dir="ltr"><<a href="mailto:yuvaraj@multicorewareinc.com" target="_blank">yuvaraj@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Yuvaraj Venkatesh <<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>><br>
# Date 1381480768 -19800<br>
#      Fri Oct 11 14:09:28 2013 +0530<br>
# Node ID 46b954edb1c52a557b9d94c4ed380ea0578c1949<br>
# Parent  8bb743458331d7cdc1008e217542e406818c5a7a<br>
dct: Replaced partialButterfly16 vector class function to intrinsic<br></blockquote><div><br></div><div>For some reason, this new version is 3x slower than the vector version; we need to figure out why.  It looks like the code-flow is the same.</div>
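
One guess, offered only as an untested sketch: every horizontal_add() in the old code became two _mm_hadd_epi32 calls against _mm_setzero_si128() plus a _mm_cvtsi128_si32 trip through a GPR, and the four scalars are then re-packed with _mm_setr_epi32. Since _mm_hadd_epi32(a, b) yields {a0+a1, a2+a3, b0+b1, b2+b3}, two rounds of it can reduce four dot-products at once and keep everything in vector registers. For the 0/8/4/12 outputs that would look roughly like this (the names s08/s412 are mine, not from the patch):

    // untested sketch: reduce four dot-products with two hadd rounds
    __m128i s08  = _mm_hadd_epi32(dst_tmp0, dst_tmp8);   // partial sums for rows 0 and 8
    __m128i s412 = _mm_hadd_epi32(dst_tmp4, dst_tmp12);  // partial sums for rows 4 and 12
    __m128i dst_0_8_4_12 = _mm_hadd_epi32(s08, s412);    // {sum0, sum8, sum4, sum12}
    __m128i dst_result = _mm_srai_epi32(_mm_add_epi32(dst_0_8_4_12, _mm_set1_epi32(add)), shift);

The 2/6/10/14 group and the odd rows could be handled the same way (for the odd rows, add the first and second halves with _mm_add_epi32 before the hadd). Whether that alone accounts for the 3x I can't say; comparing the generated assembly against the Vec4i build should tell.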

> diff -r 8bb743458331 -r 46b954edb1c5 source/common/vec/dct-sse3.cpp
> --- a/source/common/vec/dct-sse3.cpp    Fri Oct 11 12:42:16 2013 +0530
> +++ b/source/common/vec/dct-sse3.cpp    Fri Oct 11 14:09:28 2013 +0530
> @@ -1740,143 +1740,146 @@
>      int j;
>      int add = 1 << (shift - 1);
>
> -    Vec4i zero_row(64, 64, 0, 0);
> -    Vec4i four_row(83, 36, 0, 0);
> -    Vec4i eight_row(64, -64, 0, 0);
> -    Vec4i twelve_row(36, -83, 0, 0);
> +    __m128i zero_row = _mm_setr_epi32(64, 64, 0, 0);
> +    __m128i four_row = _mm_setr_epi32(83, 36, 0, 0);
> +    __m128i eight_row = _mm_setr_epi32(64, -64, 0, 0);
> +    __m128i twelve_row = _mm_setr_epi32(36, -83, 0, 0);
>
> -    Vec4i two_row(89, 75, 50, 18);
> -    Vec4i six_row(75, -18, -89, -50);
> -    Vec4i ten_row(50, -89, 18, 75);
> -    Vec4i fourteen_row(18, -50, 75, -89);
> +    __m128i two_row = _mm_setr_epi32(89, 75, 50, 18);
> +    __m128i six_row = _mm_setr_epi32(75, -18, -89, -50);
> +    __m128i ten_row = _mm_setr_epi32(50, -89, 18, 75);
> +    __m128i fourteen_row = _mm_setr_epi32(18, -50, 75, -89);
>
> -    Vec4i one_row_first_half(90, 87, 80, 70);
> -    Vec4i one_row_second_half(57, 43, 25,  9);
> -    Vec4i three_row_first_half(87, 57,  9, -43);
> -    Vec4i three_row_second_half(-80, -90, -70, -25);
> -    Vec4i five_row_first_half(80,  9, -70, -87);
> -    Vec4i five_row_second_half(-25, 57, 90, 43);
> -    Vec4i seven_row_first_half(70, -43, -87,  9);
> -    Vec4i seven_row_second_half(90, 25, -80, -57);
> -    Vec4i nine_row_first_half(57, -80, -25, 90);
> -    Vec4i nine_row_second_half(-9, -87, 43, 70);
> -    Vec4i eleven_row_first_half(43, -90, 57, 25);
> -    Vec4i eleven_row_second_half(-87, 70,  9, -80);
> -    Vec4i thirteen_row_first_half(25, -70, 90, -80);
> -    Vec4i thirteen_row_second_half(43,  9, -57, 87);
> -    Vec4i fifteen_row_first_half(9, -25, 43, -57);
> -    Vec4i fifteen_row_second_half(70, -80, 87, -90);
> +    __m128i one_row_first_half = _mm_setr_epi32(90, 87, 80, 70);
> +    __m128i one_row_second_half = _mm_setr_epi32(57, 43, 25,  9);
> +    __m128i three_row_first_half = _mm_setr_epi32(87, 57,  9, -43);
> +    __m128i three_row_second_half = _mm_setr_epi32(-80, -90, -70, -25);
> +    __m128i five_row_first_half = _mm_setr_epi32(80,  9, -70, -87);
> +    __m128i five_row_second_half = _mm_setr_epi32(-25, 57, 90, 43);
> +    __m128i seven_row_first_half = _mm_setr_epi32(70, -43, -87,  9);
> +    __m128i seven_row_second_half = _mm_setr_epi32(90, 25, -80, -57);
> +    __m128i nine_row_first_half = _mm_setr_epi32(57, -80, -25, 90);
> +    __m128i nine_row_second_half = _mm_setr_epi32(-9, -87, 43, 70);
> +    __m128i eleven_row_first_half = _mm_setr_epi32(43, -90, 57, 25);
> +    __m128i eleven_row_second_half = _mm_setr_epi32(-87, 70,  9, -80);
> +    __m128i thirteen_row_first_half = _mm_setr_epi32(25, -70, 90, -80);
> +    __m128i thirteen_row_second_half = _mm_setr_epi32(43,  9, -57, 87);
> +    __m128i fifteen_row_first_half = _mm_setr_epi32(9, -25, 43, -57);
> +    __m128i fifteen_row_second_half = _mm_setr_epi32(70, -80, 87, -90);
>
>      for (j = 0; j < line; j++)
>      {
> -        Vec8s tmp1, tmp2;
> -        tmp1.load(src);
> -        Vec4i tmp1_first_half = extend_low(tmp1);
> -        Vec4i tmp1_second_half = extend_high(tmp1);
> +        __m128i tmp1, tmp2;
> +        tmp1 = _mm_loadu_si128((__m128i*)(src));
>
> -        tmp2.load(src + 8);
> -        Vec4i tmp2_first_half_tmp = extend_low(tmp2);
> -        Vec4i tmp2_second_half_tmp = extend_high(tmp2);
> -        Vec4i tmp2_first_half = permute4i<3, 2, 1, 0>(tmp2_second_half_tmp);
> -        Vec4i tmp2_second_half = permute4i<3, 2, 1, 0>(tmp2_first_half_tmp);
> +        __m128i sign = _mm_srai_epi16(tmp1, 15);
> +        __m128i tmp1_first_half = _mm_unpacklo_epi16(tmp1, sign);
> +        __m128i tmp1_second_half = _mm_unpackhi_epi16(tmp1, sign);
>
> -        Vec4i E_first_half = tmp1_first_half + tmp2_first_half;
> -        Vec4i E_second_half_tmp = tmp1_second_half + tmp2_second_half;
> -        Vec4i O_first_half = tmp1_first_half - tmp2_first_half;
> -        Vec4i O_second_half = tmp1_second_half - tmp2_second_half;
> +        tmp2 = _mm_loadu_si128((__m128i*)(src + 8));
> +        sign = _mm_srai_epi16(tmp2, 15);
> +        __m128i tmp2_first_half_tmp = _mm_unpacklo_epi16(tmp2, sign);
> +        __m128i tmp2_second_half_tmp = _mm_unpackhi_epi16(tmp2, sign);
> +        __m128i tmp2_first_half = _mm_shuffle_epi32(tmp2_second_half_tmp, 27);
> +        __m128i tmp2_second_half = _mm_shuffle_epi32(tmp2_first_half_tmp, 27);
>
> -        Vec4i E_second_half = permute4i<3, 2, 1, 0>(E_second_half_tmp);
> +        __m128i E_first_half = _mm_add_epi32(tmp1_first_half, tmp2_first_half);
> +        __m128i E_second_half_tmp = _mm_add_epi32(tmp1_second_half, tmp2_second_half);
> +        __m128i O_first_half = _mm_sub_epi32(tmp1_first_half, tmp2_first_half);
> +        __m128i O_second_half = _mm_sub_epi32(tmp1_second_half, tmp2_second_half);
>
> -        Vec4i EE = E_first_half + E_second_half;
> -        Vec4i EO = E_first_half - E_second_half;
> +        __m128i E_second_half = _mm_shuffle_epi32(E_second_half_tmp, 27);
>
> -        Vec4i EE_first_half = permute4i<0, 1, -1, -1>(EE);
> -        Vec4i EE_second_half = permute4i<3, 2, -1, -1>(EE);
> +        __m128i EE = _mm_add_epi32(E_first_half, E_second_half);
> +        __m128i EO = _mm_sub_epi32(E_first_half, E_second_half);
>
> -        Vec4i EEE = EE_first_half + EE_second_half;
> -        Vec4i EEO = EE_first_half - EE_second_half;
> +        __m128i EE_first_half = _mm_shuffle_epi32(EE, 4);
> +        __m128i EE_second_half = _mm_shuffle_epi32(EE, 11);
>
> -        Vec4i dst_tmp0 = zero_row * EEE;
> -        Vec4i dst_tmp4 = four_row * EEO;
> -        Vec4i dst_tmp8 = eight_row * EEE;
> -        Vec4i dst_tmp12 = twelve_row * EEO;
> +        __m128i EEE = _mm_add_epi32(EE_first_half, EE_second_half);
> +        __m128i EEO = _mm_sub_epi32(EE_first_half, EE_second_half);
>
> -        int dst_zero = horizontal_add(dst_tmp0);
> -        int dst_four = horizontal_add(dst_tmp4);
> -        int dst_eight = horizontal_add(dst_tmp8);
> -        int dst_twelve = horizontal_add(dst_tmp12);
> +        __m128i dst_tmp0 = _mm_mullo_epi32(zero_row, EEE);
> +        __m128i dst_tmp4 = _mm_mullo_epi32(four_row, EEO);
> +        __m128i dst_tmp8 = _mm_mullo_epi32(eight_row, EEE);
> +        __m128i dst_tmp12 = _mm_mullo_epi32(twelve_row, EEO);
>
> -        Vec4i dst_0_8_4_12(dst_zero, dst_eight, dst_four, dst_twelve);
> +        int dst_zero = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp0, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_four = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp4, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_eight = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp8, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_twelve = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp12, _mm_setzero_si128()), _mm_setzero_si128()));
>
> -        Vec4i dst_result = dst_0_8_4_12 + add;
> -        Vec4i dst_shift_result = dst_result >> shift;
> +        __m128i dst_0_8_4_12 = _mm_setr_epi32(dst_zero, dst_eight, dst_four, dst_twelve);
>
> -        dst[0] = dst_shift_result[0];
> -        dst[8 * line] = dst_shift_result[1];
> -        dst[4 * line] = dst_shift_result[2];
> -        dst[12 * line] = dst_shift_result[3];
> +        __m128i dst_result = _mm_add_epi32(dst_0_8_4_12, _mm_set1_epi32(add));
> +        __m128i dst_shift_result = _mm_srai_epi32(dst_result, shift);
>
> -        Vec4i dst_tmp2 = two_row * EO;
> -        Vec4i dst_tmp6 = six_row * EO;
> -        Vec4i dst_tmp10 = ten_row * EO;
> -        Vec4i dst_tmp14 = fourteen_row * EO;
> +        dst[0] = _mm_cvtsi128_si32(dst_shift_result);
> +        dst[8 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_shift_result, 1));
> +        dst[4 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_shift_result, 2));
> +        dst[12 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_shift_result, 3));
>
> -        int dst_two = horizontal_add(dst_tmp2);
> -        int dst_six = horizontal_add(dst_tmp6);
> -        int dst_ten = horizontal_add(dst_tmp10);
> -        int dst_fourteen = horizontal_add(dst_tmp14);
> +        __m128i dst_tmp2 = _mm_mullo_epi32(two_row, EO);
> +        __m128i dst_tmp6 = _mm_mullo_epi32(six_row, EO);
> +        __m128i dst_tmp10 = _mm_mullo_epi32(ten_row, EO);
> +        __m128i dst_tmp14 = _mm_mullo_epi32(fourteen_row, EO);
>
> -        Vec4i dst_2_6_10_14(dst_two, dst_six, dst_ten, dst_fourteen);
> -        dst_2_6_10_14 = dst_2_6_10_14 + add;
> -        dst_2_6_10_14 = dst_2_6_10_14 >> shift;
> +        int dst_two = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp2, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_six = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp6, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_ten = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp10, _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_fourteen = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst_tmp14, _mm_setzero_si128()), _mm_setzero_si128()));
>
> -        dst[2 * line] = dst_2_6_10_14[0];
> -        dst[6 * line] = dst_2_6_10_14[1];
> -        dst[10 * line] = dst_2_6_10_14[2];
> -        dst[14 * line] = dst_2_6_10_14[3];
> +        __m128i dst_2_6_10_14 = _mm_setr_epi32(dst_two, dst_six, dst_ten, dst_fourteen);
> +        dst_2_6_10_14 = _mm_add_epi32(dst_2_6_10_14, _mm_set1_epi32(add));
> +        dst_2_6_10_14 = _mm_srai_epi32(dst_2_6_10_14, shift);
>
> -        Vec4i dst_tmp1_first_half = one_row_first_half * O_first_half;
> -        Vec4i dst_tmp1_second_half = one_row_second_half * O_second_half;
> -        Vec4i dst_tmp3_first_half = three_row_first_half * O_first_half;
> -        Vec4i dst_tmp3_second_half = three_row_second_half * O_second_half;
> -        Vec4i dst_tmp5_first_half = five_row_first_half * O_first_half;
> -        Vec4i dst_tmp5_second_half = five_row_second_half * O_second_half;
> -        Vec4i dst_tmp7_first_half = seven_row_first_half * O_first_half;
> -        Vec4i dst_tmp7_second_half = seven_row_second_half * O_second_half;
> -        Vec4i dst_tmp9_first_half = nine_row_first_half * O_first_half;
> -        Vec4i dst_tmp9_second_half = nine_row_second_half * O_second_half;
> -        Vec4i dst_tmp11_first_half = eleven_row_first_half * O_first_half;
> -        Vec4i dst_tmp11_second_half = eleven_row_second_half * O_second_half;
> -        Vec4i dst_tmp13_first_half = thirteen_row_first_half * O_first_half;
> -        Vec4i dst_tmp13_second_half = thirteen_row_second_half * O_second_half;
> -        Vec4i dst_tmp15_first_half = fifteen_row_first_half * O_first_half;
> -        Vec4i dst_tmp15_second_half = fifteen_row_second_half * O_second_half;
> +        dst[2 * line] = _mm_cvtsi128_si32(dst_2_6_10_14);
> +        dst[6 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_2_6_10_14, 1));
> +        dst[10 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_2_6_10_14, 2));
> +        dst[14 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_2_6_10_14, 3));
>
> -        int dst_one = horizontal_add(dst_tmp1_first_half) + horizontal_add(dst_tmp1_second_half);
> -        int dst_three = horizontal_add(dst_tmp3_first_half) + horizontal_add(dst_tmp3_second_half);
> -        int dst_five = horizontal_add(dst_tmp5_first_half) + horizontal_add(dst_tmp5_second_half);
> -        int dst_seven = horizontal_add(dst_tmp7_first_half) + horizontal_add(dst_tmp7_second_half);
> -        int dst_nine = horizontal_add(dst_tmp9_first_half) + horizontal_add(dst_tmp9_second_half);
> -        int dst_eleven = horizontal_add(dst_tmp11_first_half) + horizontal_add(dst_tmp11_second_half);
> -        int dst_thirteen = horizontal_add(dst_tmp13_first_half) + horizontal_add(dst_tmp13_second_half);
> -        int dst_fifteen = horizontal_add(dst_tmp15_first_half) + horizontal_add(dst_tmp15_second_half);
> +        __m128i dst_tmp1_first_half = _mm_mullo_epi32(one_row_first_half, O_first_half);
> +        __m128i dst_tmp1_second_half = _mm_mullo_epi32(one_row_second_half, O_second_half);
> +        __m128i dst_tmp3_first_half = _mm_mullo_epi32(three_row_first_half, O_first_half);
> +        __m128i dst_tmp3_second_half = _mm_mullo_epi32(three_row_second_half, O_second_half);
> +        __m128i dst_tmp5_first_half = _mm_mullo_epi32(five_row_first_half, O_first_half);
> +        __m128i dst_tmp5_second_half = _mm_mullo_epi32(five_row_second_half, O_second_half);
> +        __m128i dst_tmp7_first_half = _mm_mullo_epi32(seven_row_first_half, O_first_half);
> +        __m128i dst_tmp7_second_half = _mm_mullo_epi32(seven_row_second_half, O_second_half);
> +        __m128i dst_tmp9_first_half = _mm_mullo_epi32(nine_row_first_half, O_first_half);
> +        __m128i dst_tmp9_second_half = _mm_mullo_epi32(nine_row_second_half, O_second_half);
> +        __m128i dst_tmp11_first_half = _mm_mullo_epi32(eleven_row_first_half, O_first_half);
> +        __m128i dst_tmp11_second_half = _mm_mullo_epi32(eleven_row_second_half, O_second_half);
> +        __m128i dst_tmp13_first_half = _mm_mullo_epi32(thirteen_row_first_half, O_first_half);
> +        __m128i dst_tmp13_second_half = _mm_mullo_epi32(thirteen_row_second_half, O_second_half);
> +        __m128i dst_tmp15_first_half = _mm_mullo_epi32(fifteen_row_first_half, O_first_half);
> +        __m128i dst_tmp15_second_half = _mm_mullo_epi32(fifteen_row_second_half, O_second_half);
>
> -        Vec4i dst_1_3_5_7(dst_one, dst_three, dst_five, dst_seven);
> -        dst_1_3_5_7 = dst_1_3_5_7 + add;
> -        dst_1_3_5_7 = dst_1_3_5_7 >> shift;
> +        int dst_one = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp1_first_half, dst_tmp1_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_three = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp3_first_half, dst_tmp3_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_five = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp5_first_half, dst_tmp5_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_seven = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp7_first_half, dst_tmp7_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_nine = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp9_first_half, dst_tmp9_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_eleven = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp11_first_half, dst_tmp11_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_thirteen = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp13_first_half, dst_tmp13_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> +        int dst_fifteen = _mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_add_epi32(dst_tmp15_first_half, dst_tmp15_second_half), _mm_setzero_si128()), _mm_setzero_si128()));
> -        Vec4i dst_9_11_13_15(dst_nine, dst_eleven, dst_thirteen, dst_fifteen);
> -        dst_9_11_13_15 = dst_9_11_13_15 + add;
> -        dst_9_11_13_15 = dst_9_11_13_15 >> shift;
> +        __m128i dst_1_3_5_7 = _mm_setr_epi32(dst_one, dst_three, dst_five, dst_seven);
> +        dst_1_3_5_7 = _mm_add_epi32(dst_1_3_5_7, _mm_set1_epi32(add));
> +        dst_1_3_5_7 = _mm_srai_epi32(dst_1_3_5_7, shift);
>
> -        dst[1 * line] = dst_1_3_5_7[0];
> -        dst[3 * line] = dst_1_3_5_7[1];
> -        dst[5 * line] = dst_1_3_5_7[2];
> -        dst[7 * line] = dst_1_3_5_7[3];
> -        dst[9 * line] = dst_9_11_13_15[0];
> -        dst[11 * line] = dst_9_11_13_15[1];
> -        dst[13 * line] = dst_9_11_13_15[2];
> -        dst[15 * line] = dst_9_11_13_15[3];
> +        __m128i dst_9_11_13_15 = _mm_setr_epi32(dst_nine, dst_eleven, dst_thirteen, dst_fifteen);
> +        dst_9_11_13_15 = _mm_add_epi32(dst_9_11_13_15, _mm_set1_epi32(add));
> +        dst_9_11_13_15 = _mm_srai_epi32(dst_9_11_13_15, shift);
> +
> +        dst[1 * line] = _mm_cvtsi128_si32(dst_1_3_5_7);
> +        dst[3 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_1_3_5_7, 1));
> +        dst[5 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_1_3_5_7, 2));
> +        dst[7 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_1_3_5_7, 3));
> +        dst[9 * line] = _mm_cvtsi128_si32(dst_9_11_13_15);
> +        dst[11 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_9_11_13_15, 1));
> +        dst[13 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_9_11_13_15, 2));
> +        dst[15 * line] = _mm_cvtsi128_si32(_mm_shuffle_epi32(dst_9_11_13_15, 3));
>
>          src += 16;
>          dst++;
> @@ -1899,15 +1902,9 @@
>      partialButterfly16(block, coef, shift_1st, 16);
>      partialButterfly16(coef, block, shift_2nd, 16);
>
> -    /* TODO: inline cvt16to32 once it is intrinsic based */
>  #define N (16)
> -    for (int i = 0; i < N; i++)
> -    {
> -        for (int j = 0; j < N; j++)
> -        {
> -            dst[i * N + j] = block[i * N + j];
> -        }
> -    }
> +
> +    convert16to32(block, dst, N*N);
>
>  #undef N
>  }
> _______________________________________________
> x265-devel mailing list
> x265-devel@videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

--
Steve Borho