[x265] [PATCH] dct: Replaced inversedst vector class function with intrinsics

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Fri Oct 11 15:00:52 CEST 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381496411 -19800
#      Fri Oct 11 18:30:11 2013 +0530
# Node ID c90ce1abb18bff0e6ac49b7a39da1e7d86836837
# Parent  f1e462de1341b222ceb4295129426b93b3afdeca
dct: Replaced inversedst vector class function with intrinsics

diff -r f1e462de1341 -r c90ce1abb18b source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Fri Oct 11 16:52:58 2013 +0530
+++ b/source/common/vec/dct-sse3.cpp	Fri Oct 11 18:30:11 2013 +0530
@@ -2216,6 +2216,7 @@
     }
 }
 
+
 void dct32(short *src, int *dst, intptr_t stride)
 {
     const int shift_1st = 4;
@@ -2244,32 +2245,42 @@
 {
     int rnd_factor = 1 << (shift - 1);
 
-    Vec8s tmp0, tmp1;
+    __m128i tmp0, tmp1;
 
-    tmp0.load_a(tmp);
-    tmp1.load_a(tmp + 8);
+    tmp0 = _mm_load_si128((__m128i*)tmp);
+    __m128i sign = _mm_srai_epi16(tmp0, 15);
+    __m128i c0 = _mm_unpacklo_epi16(tmp0, sign);
+    __m128i c1 = _mm_unpackhi_epi16(tmp0, sign);
+    tmp1 = _mm_load_si128((__m128i*)(tmp + 8));
+    sign = _mm_srai_epi16(tmp1, 15);
+    __m128i c2 = _mm_unpacklo_epi16(tmp1, sign);
+    __m128i c3 = _mm_unpackhi_epi16(tmp1, sign);
 
-    Vec4i c0 = extend_low(tmp0);
-    Vec4i c1 = extend_high(tmp0);
-    Vec4i c2 = extend_low(tmp1);
-    Vec4i c3 = extend_high(tmp1);
+    __m128i c0_total = _mm_add_epi32(c0, c2);
+    __m128i c1_total = _mm_add_epi32(c2, c3);
+    __m128i c2_total = _mm_sub_epi32(c0, c3);
+    __m128i c3_total = _mm_mullo_epi32(_mm_set1_epi32(74), c1);
 
-    Vec4i c0_total = c0 + c2;
-    Vec4i c1_total = c2 + c3;
-    Vec4i c2_total = c0 - c3;
-    Vec4i c3_total = 74 * c1;
+    __m128i c4 = _mm_add_epi32(_mm_sub_epi32(c0, c2), c3);
 
-    Vec4i c4 = (c0 - c2 + c3);
+    __m128i c0_final = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(29), c0_total), _mm_mullo_epi32(_mm_set1_epi32(55), c1_total)), c3_total), _mm_set1_epi32(rnd_factor)), shift);
+    __m128i c1_final = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_mullo_epi32(_mm_set1_epi32(55), c2_total), _mm_mullo_epi32(_mm_set1_epi32(29), c1_total)), c3_total), _mm_set1_epi32(rnd_factor)), shift);
+    __m128i c2_final = _mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(74), c4), _mm_set1_epi32(rnd_factor)), shift);
+    __m128i c3_final = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(_mm_add_epi32(_mm_mullo_epi32(_mm_set1_epi32(55), c0_total), _mm_mullo_epi32(_mm_set1_epi32(29), c2_total)), c3_total), _mm_set1_epi32(rnd_factor)), shift);
 
-    Vec4i c0_final = (29 * c0_total + 55 * c1_total + c3_total + rnd_factor) >> shift;
-    Vec4i c1_final = (55 * c2_total - 29 * c1_total + c3_total + rnd_factor) >> shift;
-    Vec4i c2_final = (74 * c4 + rnd_factor) >> shift;
-    Vec4i c3_final = (55 * c0_total + 29 * c2_total - c3_total + rnd_factor) >> shift;
+    __m128i half0 = _mm_packs_epi32(c0_final, c1_final);
+    __m128i half1 = _mm_packs_epi32(c2_final, c3_final);
 
-    Vec8s half0 = compress_saturated(c0_final, c1_final);
-    Vec8s half1 = compress_saturated(c2_final, c3_final);
-    blend8s<0, 4, 8, 12, 1, 5, 9, 13>(half0, half1).store_a(block);
-    blend8s<2, 6, 10, 14, 3, 7, 11, 15>(half0, half1).store_a(block + 8);
+    tmp0 = _mm_unpacklo_epi64(half0, _mm_setzero_si128());
+    tmp1 = _mm_unpackhi_epi64(half0, _mm_setzero_si128());
+    half0 = _mm_unpacklo_epi16(tmp0, tmp1);
+
+    tmp0 = _mm_unpacklo_epi64(half1, _mm_setzero_si128());
+    tmp1 = _mm_unpackhi_epi64(half1, _mm_setzero_si128());
+    half1 = _mm_unpacklo_epi16(tmp0, tmp1);
+
+    _mm_store_si128((__m128i*)(block), _mm_unpacklo_epi32(half0, half1));
+    _mm_store_si128((__m128i*)(block + 8), _mm_unpackhi_epi32(half0, half1));
 }
 
 void idst4(int *src, short *dst, intptr_t stride)


More information about the x265-devel mailing list