<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Oct 11, 2013 at 6:25 AM, <span dir="ltr">&lt;<a href="mailto:yuvaraj@multicorewareinc.com" target="_blank">yuvaraj@multicorewareinc.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Yuvaraj Venkatesh &lt;<a href="mailto:yuvaraj@multicorewareinc.com">yuvaraj@multicorewareinc.com</a>&gt;<br>
# Date 1381490578 -19800<br>
# Fri Oct 11 16:52:58 2013 +0530<br>
# Node ID f1e462de1341b222ceb4295129426b93b3afdeca<br>
# Parent 46b954edb1c52a557b9d94c4ed380ea0578c1949<br>
dct: Replaced partialButterfly32 vector class function to intrinsic<br></blockquote><div><br></div><div>This one is also slower by 50%.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
diff -r 46b954edb1c5 -r f1e462de1341 source/common/vec/dct-sse3.cpp<br>
--- a/source/common/vec/dct-sse3.cpp Fri Oct 11 14:09:28 2013 +0530<br>
+++ b/source/common/vec/dct-sse3.cpp Fri Oct 11 16:52:58 2013 +0530<br>
@@ -1914,209 +1914,212 @@<br>
int j;<br>
int add = 1 << (shift - 1);<br>
<br>
- Vec4i zero_row_first_two(64, 64, 0, 0);<br>
- Vec4i eight_row_first_two(83, 36, 0, 0);<br>
- Vec4i sixten_row_first_two(64, -64, 0, 0);<br>
- Vec4i twentyfour_row_first_two(36, -83, 0, 0);<br>
+ __m128i zero_row_first_two = _mm_setr_epi32(64, 64, 0, 0);<br>
+ __m128i eight_row_first_two = _mm_setr_epi32(83, 36, 0, 0);<br>
+ __m128i sixten_row_first_two = _mm_setr_epi32(64, -64, 0, 0);<br>
+ __m128i twentyfour_row_first_two = _mm_setr_epi32(36, -83, 0, 0);<br>
<br>
- Vec4i four_row_first_four(89, 75, 50, 18);<br>
- Vec4i twelve_row_first_four(75, -18, -89, -50);<br>
- Vec4i twenty_row_first_four(50, -89, 18, 75);<br>
- Vec4i twentyeight_row_first_four(18, -50, 75, -89);<br>
+ __m128i four_row_first_four = _mm_setr_epi32(89, 75, 50, 18);<br>
+ __m128i twelve_row_first_four = _mm_setr_epi32(75, -18, -89, -50);<br>
+ __m128i twenty_row_first_four = _mm_setr_epi32(50, -89, 18, 75);<br>
+ __m128i twentyeight_row_first_four = _mm_setr_epi32(18, -50, 75, -89);<br>
<br>
- Vec4i two_row_first_four(90, 87, 80, 70);<br>
- Vec4i two_row_second_four(57, 43, 25, 9);<br>
- Vec4i six_row_first_four(87, 57, 9, -43);<br>
- Vec4i six_row_second_four(-80, -90, -70, -25);<br>
- Vec4i ten_row_first_four(80, 9, -70, -87);<br>
- Vec4i ten_row_second_four(-25, 57, 90, 43);<br>
- Vec4i fourteen_row_first_four(70, -43, -87, 9);<br>
- Vec4i fourteen_row_second_four(90, 25, -80, -57);<br>
- Vec4i eighteen_row_first_four(57, -80, -25, 90);<br>
- Vec4i eighteen_row_second_four(-9, -87, 43, 70);<br>
- Vec4i twentytwo_row_first_four(43, -90, 57, 25);<br>
- Vec4i twentytwo_row_second_four(-87, 70, 9, -80);<br>
- Vec4i twentysix_row_first_four(25, -70, 90, -80);<br>
- Vec4i twentysix_row_second_four(43, 9, -57, 87);<br>
- Vec4i thirty_row_first_four(9, -25, 43, -57);<br>
- Vec4i thirty_row_second_four(70, -80, 87, -90);<br>
+ __m128i two_row_first_four = _mm_setr_epi32(90, 87, 80, 70);<br>
+ __m128i two_row_second_four = _mm_setr_epi32(57, 43, 25, 9);<br>
+ __m128i six_row_first_four = _mm_setr_epi32(87, 57, 9, -43);<br>
+ __m128i six_row_second_four = _mm_setr_epi32(-80, -90, -70, -25);<br>
+ __m128i ten_row_first_four = _mm_setr_epi32(80, 9, -70, -87);<br>
+ __m128i ten_row_second_four = _mm_setr_epi32(-25, 57, 90, 43);<br>
+ __m128i fourteen_row_first_four = _mm_setr_epi32(70, -43, -87, 9);<br>
+ __m128i fourteen_row_second_four = _mm_setr_epi32(90, 25, -80, -57);<br>
+ __m128i eighteen_row_first_four = _mm_setr_epi32(57, -80, -25, 90);<br>
+ __m128i eighteen_row_second_four = _mm_setr_epi32(-9, -87, 43, 70);<br>
+ __m128i twentytwo_row_first_four = _mm_setr_epi32(43, -90, 57, 25);<br>
+ __m128i twentytwo_row_second_four = _mm_setr_epi32(-87, 70, 9, -80);<br>
+ __m128i twentysix_row_first_four = _mm_setr_epi32(25, -70, 90, -80);<br>
+ __m128i twentysix_row_second_four = _mm_setr_epi32(43, 9, -57, 87);<br>
+ __m128i thirty_row_first_four = _mm_setr_epi32(9, -25, 43, -57);<br>
+ __m128i thirty_row_second_four = _mm_setr_epi32(70, -80, 87, -90);<br>
<br>
- Vec4i one_row_first_four(90, 90, 88, 85);<br>
- Vec4i one_row_second_four(82, 78, 73, 67);<br>
- Vec4i one_row_third_four(61, 54, 46, 38);<br>
- Vec4i one_row_fourth_four(31, 22, 13, 4);<br>
+ __m128i one_row_first_four = _mm_setr_epi32(90, 90, 88, 85);<br>
+ __m128i one_row_second_four = _mm_setr_epi32(82, 78, 73, 67);<br>
+ __m128i one_row_third_four = _mm_setr_epi32(61, 54, 46, 38);<br>
+ __m128i one_row_fourth_four = _mm_setr_epi32(31, 22, 13, 4);<br>
<br>
- Vec4i three_row_first_four(90, 82, 67, 46);<br>
- Vec4i three_row_second_four(22, -4, -31, -54);<br>
- Vec4i three_row_third_four(-73, -85, -90, -88);<br>
- Vec4i three_row_fourth_four(-78, -61, -38, -13);<br>
+ __m128i three_row_first_four = _mm_setr_epi32(90, 82, 67, 46);<br>
+ __m128i three_row_second_four = _mm_setr_epi32(22, -4, -31, -54);<br>
+ __m128i three_row_third_four = _mm_setr_epi32(-73, -85, -90, -88);<br>
+ __m128i three_row_fourth_four = _mm_setr_epi32(-78, -61, -38, -13);<br>
<br>
- Vec4i five_row_first_four(88, 67, 31, -13);<br>
- Vec4i five_row_second_four(-54, -82, -90, -78);<br>
- Vec4i five_row_third_four(-46, -4, 38, 73);<br>
- Vec4i five_row_fourth_four(90, 85, 61, 22);<br>
+ __m128i five_row_first_four = _mm_setr_epi32(88, 67, 31, -13);<br>
+ __m128i five_row_second_four = _mm_setr_epi32(-54, -82, -90, -78);<br>
+ __m128i five_row_third_four = _mm_setr_epi32(-46, -4, 38, 73);<br>
+ __m128i five_row_fourth_four = _mm_setr_epi32(90, 85, 61, 22);<br>
<br>
- Vec4i seven_row_first_four(85, 46, -13, -67);<br>
- Vec4i seven_row_second_four(-90, -73, -22, 38);<br>
- Vec4i seven_row_third_four(82, 88, 54, -4);<br>
- Vec4i seven_row_fourth_four(-61, -90, -78, -31);<br>
+ __m128i seven_row_first_four = _mm_setr_epi32(85, 46, -13, -67);<br>
+ __m128i seven_row_second_four = _mm_setr_epi32(-90, -73, -22, 38);<br>
+ __m128i seven_row_third_four = _mm_setr_epi32(82, 88, 54, -4);<br>
+ __m128i seven_row_fourth_four = _mm_setr_epi32(-61, -90, -78, -31);<br>
<br>
- Vec4i nine_row_first_four(82, 22, -54, -90);<br>
- Vec4i nine_row_second_four(-61, 13, 78, 85);<br>
- Vec4i nine_row_third_four(31, -46, -90, -67);<br>
- Vec4i nine_row_fourth_four(4, 73, 88, 38);<br>
+ __m128i nine_row_first_four = _mm_setr_epi32(82, 22, -54, -90);<br>
+ __m128i nine_row_second_four = _mm_setr_epi32(-61, 13, 78, 85);<br>
+ __m128i nine_row_third_four = _mm_setr_epi32(31, -46, -90, -67);<br>
+ __m128i nine_row_fourth_four = _mm_setr_epi32(4, 73, 88, 38);<br>
<br>
- Vec4i eleven_row_first_four(78, -4, -82, -73);<br>
- Vec4i eleven_row_second_four(13, 85, 67, -22);<br>
- Vec4i eleven_row_third_four(-88, -61, 31, 90);<br>
- Vec4i eleven_row_fourth_four(54, -38, -90, -46);<br>
+ __m128i eleven_row_first_four = _mm_setr_epi32(78, -4, -82, -73);<br>
+ __m128i eleven_row_second_four = _mm_setr_epi32(13, 85, 67, -22);<br>
+ __m128i eleven_row_third_four = _mm_setr_epi32(-88, -61, 31, 90);<br>
+ __m128i eleven_row_fourth_four = _mm_setr_epi32(54, -38, -90, -46);<br>
<br>
- Vec4i thirteen_row_first_four(73, -31, -90, -22);<br>
- Vec4i thirteen_row_second_four(78, 67, -38, -90);<br>
- Vec4i thirteen_row_third_four(-13, 82, 61, -46);<br>
- Vec4i thirteen_row_fourth_four(-88, -4, 85, 54);<br>
+ __m128i thirteen_row_first_four = _mm_setr_epi32(73, -31, -90, -22);<br>
+ __m128i thirteen_row_second_four = _mm_setr_epi32(78, 67, -38, -90);<br>
+ __m128i thirteen_row_third_four = _mm_setr_epi32(-13, 82, 61, -46);<br>
+ __m128i thirteen_row_fourth_four = _mm_setr_epi32(-88, -4, 85, 54);<br>
<br>
- Vec4i fifteen_row_first_four(67, -54, -78, 38);<br>
- Vec4i fifteen_row_second_four(85, -22, -90, 4);<br>
- Vec4i fifteen_row_third_four(90, 13, -88, -31);<br>
- Vec4i fifteen_row_fourth_four(82, 46, -73, -61);<br>
+ __m128i fifteen_row_first_four = _mm_setr_epi32(67, -54, -78, 38);<br>
+ __m128i fifteen_row_second_four = _mm_setr_epi32(85, -22, -90, 4);<br>
+ __m128i fifteen_row_third_four = _mm_setr_epi32(90, 13, -88, -31);<br>
+ __m128i fifteen_row_fourth_four = _mm_setr_epi32(82, 46, -73, -61);<br>
<br>
- Vec4i seventeen_row_first_four(61, -73, -46, 82);<br>
- Vec4i seventeen_row_second_four(31, -88, -13, 90);<br>
- Vec4i seventeen_row_third_four(-4, -90, 22, 85);<br>
- Vec4i seventeen_row_fourth_four(-38, -78, 54, 67);<br>
+ __m128i seventeen_row_first_four = _mm_setr_epi32(61, -73, -46, 82);<br>
+ __m128i seventeen_row_second_four = _mm_setr_epi32(31, -88, -13, 90);<br>
+ __m128i seventeen_row_third_four = _mm_setr_epi32(-4, -90, 22, 85);<br>
+ __m128i seventeen_row_fourth_four = _mm_setr_epi32(-38, -78, 54, 67);<br>
<br>
- Vec4i nineteen_row_first_four(54, -85, -4, 88);<br>
- Vec4i nineteen_row_second_four(-46, -61, 82, 13);<br>
- Vec4i nineteen_row_third_four(-90, 38, 67, -78);<br>
- Vec4i nineteen_row_fourth_four(-22, 90, -31, -73);<br>
+ __m128i nineteen_row_first_four = _mm_setr_epi32(54, -85, -4, 88);<br>
+ __m128i nineteen_row_second_four = _mm_setr_epi32(-46, -61, 82, 13);<br>
+ __m128i nineteen_row_third_four = _mm_setr_epi32(-90, 38, 67, -78);<br>
+ __m128i nineteen_row_fourth_four = _mm_setr_epi32(-22, 90, -31, -73);<br>
<br>
- Vec4i twentyone_row_first_four(46, -90, 38, 54);<br>
- Vec4i twentyone_row_second_four(-90, 31, 61, -88);<br>
- Vec4i twentyone_row_third_four(22, 67, -85, 13);<br>
- Vec4i twentyone_row_fourth_four(73, -82, 4, 78);<br>
+ __m128i twentyone_row_first_four = _mm_setr_epi32(46, -90, 38, 54);<br>
+ __m128i twentyone_row_second_four = _mm_setr_epi32(-90, 31, 61, -88);<br>
+ __m128i twentyone_row_third_four = _mm_setr_epi32(22, 67, -85, 13);<br>
+ __m128i twentyone_row_fourth_four = _mm_setr_epi32(73, -82, 4, 78);<br>
<br>
- Vec4i twentythree_row_first_four(38, -88, 73, -4);<br>
- Vec4i twentythree_row_second_four(-67, 90, -46, -31);<br>
- Vec4i twentythree_row_third_four(85, -78, 13, 61);<br>
- Vec4i twentythree_row_fourth_four(-90, 54, 22, -82);<br>
+ __m128i twentythree_row_first_four = _mm_setr_epi32(38, -88, 73, -4);<br>
+ __m128i twentythree_row_second_four = _mm_setr_epi32(-67, 90, -46, -31);<br>
+ __m128i twentythree_row_third_four = _mm_setr_epi32(85, -78, 13, 61);<br>
+ __m128i twentythree_row_fourth_four = _mm_setr_epi32(-90, 54, 22, -82);<br>
<br>
- Vec4i twentyfive_row_first_four(31, -78, 90, -61);<br>
- Vec4i twentyfive_row_second_four(4, 54, -88, 82);<br>
- Vec4i twentyfive_row_third_four(-38, -22, 73, -90);<br>
- Vec4i twentyfive_row_fourth_four(67, -13, -46, 85);<br>
+ __m128i twentyfive_row_first_four = _mm_setr_epi32(31, -78, 90, -61);<br>
+ __m128i twentyfive_row_second_four = _mm_setr_epi32(4, 54, -88, 82);<br>
+ __m128i twentyfive_row_third_four = _mm_setr_epi32(-38, -22, 73, -90);<br>
+ __m128i twentyfive_row_fourth_four = _mm_setr_epi32(67, -13, -46, 85);<br>
<br>
- Vec4i twentyseven_row_first_four(22, -61, 85, -90);<br>
- Vec4i twentyseven_row_second_four(73, -38, -4, 46);<br>
- Vec4i twentyseven_row_third_four(-78, 90, -82, 54);<br>
- Vec4i twentyseven_row_fourth_four(-13, -31, 67, -88);<br>
+ __m128i twentyseven_row_first_four = _mm_setr_epi32(22, -61, 85, -90);<br>
+ __m128i twentyseven_row_second_four = _mm_setr_epi32(73, -38, -4, 46);<br>
+ __m128i twentyseven_row_third_four = _mm_setr_epi32(-78, 90, -82, 54);<br>
+ __m128i twentyseven_row_fourth_four = _mm_setr_epi32(-13, -31, 67, -88);<br>
<br>
- Vec4i twentynine_row_first_four(13, -38, 61, -78);<br>
- Vec4i twentynine_row_second_four(88, -90, 85, -73);<br>
- Vec4i twentynine_row_third_four(54, -31, 4, 22);<br>
- Vec4i twentynine_row_fourth_four(-46, 67, -82, 90);<br>
+ __m128i twentynine_row_first_four = _mm_setr_epi32(13, -38, 61, -78);<br>
+ __m128i twentynine_row_second_four = _mm_setr_epi32(88, -90, 85, -73);<br>
+ __m128i twentynine_row_third_four = _mm_setr_epi32(54, -31, 4, 22);<br>
+ __m128i twentynine_row_fourth_four = _mm_setr_epi32(-46, 67, -82, 90);<br>
<br>
- Vec4i thirtyone_row_first_four(4, -13, 22, -31);<br>
- Vec4i thirtyone_row_second_four(38, -46, 54, -61);<br>
- Vec4i thirtyone_row_third_four(67, -73, 78, -82);<br>
- Vec4i thirtyone_row_fourth_four(85, -88, 90, -90);<br>
+ __m128i thirtyone_row_first_four = _mm_setr_epi32(4, -13, 22, -31);<br>
+ __m128i thirtyone_row_second_four = _mm_setr_epi32(38, -46, 54, -61);<br>
+ __m128i thirtyone_row_third_four = _mm_setr_epi32(67, -73, 78, -82);<br>
+ __m128i thirtyone_row_fourth_four = _mm_setr_epi32(85, -88, 90, -90);<br>
<br>
for (j = 0; j < line; j++)<br>
{<br>
- Vec8s tmp1, tmp2, tmp3, tmp4;<br>
+ __m128i tmp1, tmp2, tmp3, tmp4;<br>
<br>
- tmp1.load(src);<br>
- Vec4i tmp1_first_half = extend_low(tmp1);<br>
- Vec4i tmp1_second_half = extend_high(tmp1);<br>
+ tmp1 = _mm_loadu_si128((__m128i*)(src));<br>
<br>
- tmp2.load(src + 8);<br>
- Vec4i tmp2_first_half = extend_low(tmp2);<br>
- Vec4i tmp2_second_half = extend_high(tmp2);<br>
+ __m128i sign = _mm_srai_epi16(tmp1, 15);<br>
+ __m128i tmp1_first_half = _mm_unpacklo_epi16(tmp1, sign);<br>
+ __m128i tmp1_second_half = _mm_unpackhi_epi16(tmp1, sign);<br>
<br>
- tmp3.load(src + 16);<br>
- Vec4i tmp3_first_half_tmp = extend_low(tmp3);<br>
- Vec4i tmp3_second_half_tmp = extend_high(tmp3);<br>
- Vec4i tmp3_first_half = permute4i<3, 2, 1, 0>(tmp3_first_half_tmp);<br>
- Vec4i tmp3_second_half = permute4i<3, 2, 1, 0>(tmp3_second_half_tmp);<br>
+ tmp2 = _mm_loadu_si128((__m128i*)(src + 8));<br>
+ sign = _mm_srai_epi16(tmp2, 15);<br>
+ __m128i tmp2_first_half = _mm_unpacklo_epi16(tmp2, sign);<br>
+ __m128i tmp2_second_half = _mm_unpackhi_epi16(tmp2, sign);<br>
<br>
- tmp4.load(src + 24);<br>
- Vec4i tmp4_first_half_tmp = extend_low(tmp4);<br>
- Vec4i tmp4_second_half_tmp = extend_high(tmp4);<br>
- Vec4i tmp4_first_half = permute4i<3, 2, 1, 0>(tmp4_first_half_tmp);<br>
- Vec4i tmp4_second_half = permute4i<3, 2, 1, 0>(tmp4_second_half_tmp);<br>
+ tmp3 = _mm_loadu_si128((__m128i*)(src + 16));<br>
+ sign = _mm_srai_epi16(tmp3, 15);<br>
+ __m128i tmp3_first_half_tmp = _mm_unpacklo_epi16(tmp3, sign);<br>
+ __m128i tmp3_second_half_tmp = _mm_unpackhi_epi16(tmp3, sign);<br>
+ __m128i tmp3_first_half = _mm_shuffle_epi32(tmp3_first_half_tmp, 27);<br>
+ __m128i tmp3_second_half = _mm_shuffle_epi32(tmp3_second_half_tmp, 27);<br>
<br>
- Vec4i E_first_four = tmp1_first_half + tmp4_second_half;<br>
- Vec4i E_second_four = tmp1_second_half + tmp4_first_half;<br>
- Vec4i E_third_four = tmp2_first_half + tmp3_second_half;<br>
- Vec4i E_last_four = tmp2_second_half + tmp3_first_half;<br>
+ tmp4 = _mm_loadu_si128((__m128i*)(src + 24));<br>
+ sign = _mm_srai_epi16(tmp4, 15);<br>
+ __m128i tmp4_first_half_tmp = _mm_unpacklo_epi16(tmp4, sign);<br>
+ __m128i tmp4_second_half_tmp = _mm_unpackhi_epi16(tmp4, sign);<br>
+ __m128i tmp4_first_half = _mm_shuffle_epi32(tmp4_first_half_tmp, 27);<br>
+ __m128i tmp4_second_half = _mm_shuffle_epi32(tmp4_second_half_tmp, 27);<br>
<br>
- Vec4i O_first_four = tmp1_first_half - tmp4_second_half;<br>
- Vec4i O_second_four = tmp1_second_half - tmp4_first_half;<br>
- Vec4i O_third_four = tmp2_first_half - tmp3_second_half;<br>
- Vec4i O_last_four = tmp2_second_half - tmp3_first_half;<br>
+ __m128i E_first_four = _mm_add_epi32(tmp1_first_half, tmp4_second_half);<br>
+ __m128i E_second_four = _mm_add_epi32(tmp1_second_half, tmp4_first_half);<br>
+ __m128i E_third_four = _mm_add_epi32(tmp2_first_half, tmp3_second_half);<br>
+ __m128i E_last_four = _mm_add_epi32(tmp2_second_half, tmp3_first_half);<br>
<br>
- Vec4i E_last_four_rev = permute4i<3, 2, 1, 0>(E_last_four);<br>
- Vec4i E_third_four_rev = permute4i<3, 2, 1, 0>(E_third_four);<br>
+ __m128i O_first_four = _mm_sub_epi32(tmp1_first_half, tmp4_second_half);<br>
+ __m128i O_second_four = _mm_sub_epi32(tmp1_second_half, tmp4_first_half);<br>
+ __m128i O_third_four = _mm_sub_epi32(tmp2_first_half, tmp3_second_half);<br>
+ __m128i O_last_four = _mm_sub_epi32(tmp2_second_half, tmp3_first_half);<br>
<br>
- Vec4i EE_first_four = E_first_four + E_last_four_rev;<br>
- Vec4i EE_last_four = E_second_four + E_third_four_rev;<br>
- Vec4i EO_first_four = E_first_four - E_last_four_rev;<br>
- Vec4i EO_last_four = E_second_four - E_third_four_rev;<br>
+ __m128i E_last_four_rev = _mm_shuffle_epi32(E_last_four, 27);<br>
+ __m128i E_third_four_rev = _mm_shuffle_epi32(E_third_four, 27);<br>
<br>
- Vec4i EE_last_four_rev = permute4i<3, 2, 1, 0>(EE_last_four);<br>
+ __m128i EE_first_four = _mm_add_epi32(E_first_four, E_last_four_rev);<br>
+ __m128i EE_last_four = _mm_add_epi32(E_second_four, E_third_four_rev);<br>
+ __m128i EO_first_four = _mm_sub_epi32(E_first_four, E_last_four_rev);<br>
+ __m128i EO_last_four = _mm_sub_epi32(E_second_four, E_third_four_rev);<br>
<br>
- Vec4i EEE = EE_first_four + EE_last_four_rev;<br>
- Vec4i EEO = EE_first_four - EE_last_four_rev;<br>
+ __m128i EE_last_four_rev = _mm_shuffle_epi32(EE_last_four, 27);<br>
<br>
- Vec4i EEEE_first_half = permute4i<0, 1, -1, -1>(EEE);<br>
- Vec4i EEEE_second_half = permute4i<3, 2, -1, -1>(EEE);<br>
- Vec4i EEEE = EEEE_first_half + EEEE_second_half;<br>
- Vec4i EEEO = EEEE_first_half - EEEE_second_half;<br>
+ __m128i EEE = _mm_add_epi32(EE_first_four, EE_last_four_rev);<br>
+ __m128i EEO = _mm_sub_epi32(EE_first_four, EE_last_four_rev);<br>
<br>
- int dst0_hresult = (horizontal_add(zero_row_first_two * EEEE) + add) >> shift;<br>
- int dst8_hresult = (horizontal_add(eight_row_first_two * EEEO) + add) >> shift;<br>
- int dst16_hresult = (horizontal_add(sixten_row_first_two * EEEE) + add) >> shift;<br>
- int dst24_hresult = (horizontal_add(twentyfour_row_first_two * EEEO) + add) >> shift;<br>
+ __m128i EEEE_first_half = _mm_shuffle_epi32(EEE, 4);<br>
+ __m128i EEEE_second_half = _mm_shuffle_epi32(EEE, 11);<br>
+ __m128i EEEE = _mm_add_epi32(EEEE_first_half, EEEE_second_half);<br>
+ __m128i EEEO = _mm_sub_epi32(EEEE_first_half, EEEE_second_half);<br>
+<br>
+ int dst0_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(zero_row_first_two, EEEE), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst8_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(eight_row_first_two, EEEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst16_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(sixten_row_first_two, EEEE), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst24_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(twentyfour_row_first_two, EEEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
dst[0] = dst0_hresult;<br>
dst[8 * line] = dst8_hresult;<br>
dst[16 * line] = dst16_hresult;<br>
dst[24 * line] = dst24_hresult;<br>
<br>
- int dst4_hresult = (horizontal_add(four_row_first_four * EEO) + add) >> shift;<br>
- int dst12_hresult = (horizontal_add(twelve_row_first_four * EEO) + add) >> shift;<br>
- int dst20_hresult = (horizontal_add(twenty_row_first_four * EEO) + add) >> shift;<br>
- int dst28_hresult = (horizontal_add(twentyeight_row_first_four * EEO) + add) >> shift;<br>
+ int dst4_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(four_row_first_four, EEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst12_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(twelve_row_first_four, EEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst20_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(twenty_row_first_four, EEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ int dst28_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(_mm_mullo_epi32(twentyeight_row_first_four, EEO), _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
dst[4 * line] = dst4_hresult;<br>
dst[12 * line] = dst12_hresult;<br>
dst[20 * line] = dst20_hresult;<br>
dst[28 * line] = dst28_hresult;<br>
+ __m128i tmp = _mm_add_epi32(_mm_mullo_epi32(two_row_first_four, EO_first_four), _mm_mullo_epi32(two_row_second_four, EO_last_four));<br>
+ int dst2_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
- int dst2_hresult =<br>
- (horizontal_add((two_row_first_four *<br>
- EO_first_four) + (two_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst6_hresult =<br>
- (horizontal_add((six_row_first_four *<br>
- EO_first_four) + (six_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst10_hresult =<br>
- (horizontal_add((ten_row_first_four *<br>
- EO_first_four) + (ten_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst14_hresult =<br>
- (horizontal_add((fourteen_row_first_four *<br>
- EO_first_four) + (fourteen_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst18_hresult =<br>
- (horizontal_add((eighteen_row_first_four *<br>
- EO_first_four) + (eighteen_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst22_hresult =<br>
- (horizontal_add((twentytwo_row_first_four *<br>
- EO_first_four) + (twentytwo_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst26_hresult =<br>
- (horizontal_add((twentysix_row_first_four *<br>
- EO_first_four) + (twentysix_row_second_four * EO_last_four)) + add) >> shift;<br>
- int dst30_hresult =<br>
- (horizontal_add((thirty_row_first_four *<br>
- EO_first_four) + (thirty_row_second_four * EO_last_four)) + add) >> shift;<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(six_row_first_four, EO_first_four), _mm_mullo_epi32(six_row_second_four, EO_last_four));<br>
+ int dst6_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(ten_row_first_four, EO_first_four), _mm_mullo_epi32(ten_row_second_four, EO_last_four));<br>
+ int dst10_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(fourteen_row_first_four, EO_first_four), _mm_mullo_epi32(fourteen_row_second_four, EO_last_four));<br>
+ int dst14_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(eighteen_row_first_four, EO_first_four), _mm_mullo_epi32(eighteen_row_second_four, EO_last_four));<br>
+ int dst18_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(twentytwo_row_first_four, EO_first_four), _mm_mullo_epi32(twentytwo_row_second_four, EO_last_four));<br>
+ int dst22_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(twentysix_row_first_four, EO_first_four), _mm_mullo_epi32(twentysix_row_second_four, EO_last_four));<br>
+ int dst26_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+<br>
+ tmp = _mm_add_epi32(_mm_mullo_epi32(thirty_row_first_four, EO_first_four), _mm_mullo_epi32(thirty_row_second_four, EO_last_four));<br>
+ int dst30_hresult = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(tmp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
dst[2 * line] = dst2_hresult;<br>
dst[6 * line] = dst6_hresult;<br>
@@ -2127,59 +2130,86 @@<br>
dst[26 * line] = dst26_hresult;<br>
dst[30 * line] = dst30_hresult;<br>
<br>
- Vec4i dst1_temp = (one_row_first_four * O_first_four) + (one_row_second_four * O_second_four) +<br>
- (one_row_third_four * O_third_four) + (one_row_fourth_four * O_last_four);<br>
- Vec4i dst3_temp = (three_row_first_four * O_first_four) + (three_row_second_four * O_second_four) +<br>
- (three_row_third_four * O_third_four) + (three_row_fourth_four * O_last_four);<br>
- Vec4i dst5_temp = (five_row_first_four * O_first_four) + (five_row_second_four * O_second_four) +<br>
- (five_row_third_four * O_third_four) + (five_row_fourth_four * O_last_four);<br>
- Vec4i dst7_temp = (seven_row_first_four * O_first_four) + (seven_row_second_four * O_second_four) +<br>
- (seven_row_third_four * O_third_four) + (seven_row_fourth_four * O_last_four);<br>
- Vec4i dst9_temp = (nine_row_first_four * O_first_four) + (nine_row_second_four * O_second_four) +<br>
- (nine_row_third_four * O_third_four) + (nine_row_fourth_four * O_last_four);<br>
- Vec4i dst11_temp = (eleven_row_first_four * O_first_four) + (eleven_row_second_four * O_second_four) +<br>
- (eleven_row_third_four * O_third_four) + (eleven_row_fourth_four * O_last_four);<br>
- Vec4i dst13_temp = (thirteen_row_first_four * O_first_four) + (thirteen_row_second_four * O_second_four) +<br>
- (thirteen_row_third_four * O_third_four) + (thirteen_row_fourth_four * O_last_four);<br>
- Vec4i dst15_temp = (fifteen_row_first_four * O_first_four) + (fifteen_row_second_four * O_second_four) +<br>
- (fifteen_row_third_four * O_third_four) + (fifteen_row_fourth_four * O_last_four);<br>
- Vec4i dst17_temp = (seventeen_row_first_four * O_first_four) + (seventeen_row_second_four * O_second_four) +<br>
- (seventeen_row_third_four * O_third_four) + (seventeen_row_fourth_four * O_last_four);<br>
- Vec4i dst19_temp = (nineteen_row_first_four * O_first_four) + (nineteen_row_second_four * O_second_four) +<br>
- (nineteen_row_third_four * O_third_four) + (nineteen_row_fourth_four * O_last_four);<br>
- Vec4i dst21_temp = (twentyone_row_first_four * O_first_four) + (twentyone_row_second_four * O_second_four) +<br>
- (twentyone_row_third_four * O_third_four) + (twentyone_row_fourth_four * O_last_four);<br>
- Vec4i dst23_temp =<br>
- (twentythree_row_first_four * O_first_four) + (twentythree_row_second_four * O_second_four) +<br>
- (twentythree_row_third_four * O_third_four) + (twentythree_row_fourth_four * O_last_four);<br>
- Vec4i dst25_temp =<br>
- (twentyfive_row_first_four * O_first_four) + (twentyfive_row_second_four * O_second_four) +<br>
- (twentyfive_row_third_four * O_third_four) + (twentyfive_row_fourth_four * O_last_four);<br>
- Vec4i dst27_temp =<br>
- (twentyseven_row_first_four * O_first_four) + (twentyseven_row_second_four * O_second_four) +<br>
- (twentyseven_row_third_four * O_third_four) + (twentyseven_row_fourth_four * O_last_four);<br>
- Vec4i dst29_temp =<br>
- (twentynine_row_first_four * O_first_four) + (twentynine_row_second_four * O_second_four) +<br>
- (twentynine_row_third_four * O_third_four) + (twentynine_row_fourth_four * O_last_four);<br>
- Vec4i dst31_temp = (thirtyone_row_first_four * O_first_four) + (thirtyone_row_second_four * O_second_four) +<br>
- (thirtyone_row_third_four * O_third_four) + (thirtyone_row_fourth_four * O_last_four);<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(one_row_first_four, O_first_four), _mm_mullo_epi32(one_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(one_row_third_four, O_third_four), _mm_mullo_epi32(one_row_fourth_four, O_last_four));<br>
+ __m128i dst1_temp = _mm_add_epi32(tmp1, tmp2);<br>
<br>
- dst[1 * line] = (horizontal_add(dst1_temp) + add) >> shift;<br>
- dst[3 * line] = (horizontal_add(dst3_temp) + add) >> shift;<br>
- dst[5 * line] = (horizontal_add(dst5_temp) + add) >> shift;<br>
- dst[7 * line] = (horizontal_add(dst7_temp) + add) >> shift;<br>
- dst[9 * line] = (horizontal_add(dst9_temp) + add) >> shift;<br>
- dst[11 * line] = (horizontal_add(dst11_temp) + add) >> shift;<br>
- dst[13 * line] = (horizontal_add(dst13_temp) + add) >> shift;<br>
- dst[15 * line] = (horizontal_add(dst15_temp) + add) >> shift;<br>
- dst[17 * line] = (horizontal_add(dst17_temp) + add) >> shift;<br>
- dst[19 * line] = (horizontal_add(dst19_temp) + add) >> shift;<br>
- dst[21 * line] = (horizontal_add(dst21_temp) + add) >> shift;<br>
- dst[23 * line] = (horizontal_add(dst23_temp) + add) >> shift;<br>
- dst[25 * line] = (horizontal_add(dst25_temp) + add) >> shift;<br>
- dst[27 * line] = (horizontal_add(dst27_temp) + add) >> shift;<br>
- dst[29 * line] = (horizontal_add(dst29_temp) + add) >> shift;<br>
- dst[31 * line] = (horizontal_add(dst31_temp) + add) >> shift;<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(three_row_first_four, O_first_four), _mm_mullo_epi32(three_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(three_row_third_four, O_third_four), _mm_mullo_epi32(three_row_fourth_four, O_last_four));<br>
+ __m128i dst3_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(five_row_first_four, O_first_four), _mm_mullo_epi32(five_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(five_row_third_four, O_third_four), _mm_mullo_epi32(five_row_fourth_four, O_last_four));<br>
+ __m128i dst5_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(seven_row_first_four, O_first_four), _mm_mullo_epi32(seven_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(seven_row_third_four, O_third_four), _mm_mullo_epi32(seven_row_fourth_four, O_last_four));<br>
+ __m128i dst7_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(nine_row_first_four, O_first_four), _mm_mullo_epi32(nine_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(nine_row_third_four, O_third_four), _mm_mullo_epi32(nine_row_fourth_four, O_last_four));<br>
+ __m128i dst9_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(eleven_row_first_four, O_first_four), _mm_mullo_epi32(eleven_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(eleven_row_third_four, O_third_four), _mm_mullo_epi32(eleven_row_fourth_four, O_last_four));<br>
+ __m128i dst11_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(thirteen_row_first_four, O_first_four), _mm_mullo_epi32(thirteen_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(thirteen_row_third_four, O_third_four), _mm_mullo_epi32(thirteen_row_fourth_four, O_last_four));<br>
+ __m128i dst13_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(fifteen_row_first_four, O_first_four), _mm_mullo_epi32(fifteen_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(fifteen_row_third_four, O_third_four), _mm_mullo_epi32(fifteen_row_fourth_four, O_last_four));<br>
+ __m128i dst15_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(seventeen_row_first_four, O_first_four), _mm_mullo_epi32(seventeen_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(seventeen_row_third_four, O_third_four), _mm_mullo_epi32(seventeen_row_fourth_four, O_last_four));<br>
+ __m128i dst17_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(nineteen_row_first_four, O_first_four), _mm_mullo_epi32(nineteen_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(nineteen_row_third_four, O_third_four), _mm_mullo_epi32(nineteen_row_fourth_four, O_last_four));<br>
+ __m128i dst19_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(twentyone_row_first_four, O_first_four), _mm_mullo_epi32(twentyone_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(twentyone_row_third_four, O_third_four), _mm_mullo_epi32(twentyone_row_fourth_four, O_last_four));<br>
+ __m128i dst21_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(twentythree_row_first_four, O_first_four), _mm_mullo_epi32(twentythree_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(twentythree_row_third_four, O_third_four), _mm_mullo_epi32(twentythree_row_fourth_four, O_last_four));<br>
+ __m128i dst23_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(twentyfive_row_first_four, O_first_four), _mm_mullo_epi32(twentyfive_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(twentyfive_row_third_four, O_third_four), _mm_mullo_epi32(twentyfive_row_fourth_four, O_last_four));<br>
+ __m128i dst25_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(twentyseven_row_first_four, O_first_four), _mm_mullo_epi32(twentyseven_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(twentyseven_row_third_four, O_third_four), _mm_mullo_epi32(twentyseven_row_fourth_four, O_last_four));<br>
+ __m128i dst27_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(twentynine_row_first_four, O_first_four), _mm_mullo_epi32(twentynine_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(twentynine_row_third_four, O_third_four), _mm_mullo_epi32(twentynine_row_fourth_four, O_last_four));<br>
+ __m128i dst29_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ tmp1 = _mm_add_epi32(_mm_mullo_epi32(thirtyone_row_first_four, O_first_four), _mm_mullo_epi32(thirtyone_row_second_four, O_second_four));<br>
+ tmp2 = _mm_add_epi32(_mm_mullo_epi32(thirtyone_row_third_four, O_third_four), _mm_mullo_epi32(thirtyone_row_fourth_four, O_last_four));<br>
+ __m128i dst31_temp = _mm_add_epi32(tmp1, tmp2);<br>
+<br>
+ dst[1 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst1_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[3 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst3_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[5 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst5_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[7 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst7_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[9 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst9_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[11 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst11_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[13 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst13_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[15 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst15_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[17 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst17_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[19 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst19_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[21 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst21_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[23 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst23_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[25 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst25_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[27 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst27_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[29 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst29_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
+ dst[31 * line] = (_mm_cvtsi128_si32(_mm_hadd_epi32(_mm_hadd_epi32(dst31_temp, _mm_setzero_si128()), _mm_setzero_si128())) + add) >> shift;<br>
<br>
src += 32;<br>
dst++;<br>
@@ -2202,15 +2232,9 @@<br>
partialButterfly32(block, coef, shift_1st, 32);<br>
partialButterfly32(coef, block, shift_2nd, 32);<br>
<br>
- /* TODO: inline cvt16to32 once it is intrinsic based */<br>
#define N (32)<br>
- for (int i = 0; i < N; i++)<br>
- {<br>
- for (int j = 0; j < N; j++)<br>
- {<br>
- dst[i * N + j] = block[i * N + j];<br>
- }<br>
- }<br>
+<br>
+ convert16to32(block, dst, N*N);<br>
<br>
#undef N<br>
}<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>