[x265] [PATCH] pixel8.inc: replace calcRecons vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Oct 8 10:38:38 CEST 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381221459 -19800
#      Tue Oct 08 14:07:39 2013 +0530
# Node ID 9d22be0b84ff2d5f3a8d4ee4d319a75f7f9c73a4
# Parent  d85c49059b6a30af455cf47ad38ea172c579cb9e
pixel8.inc: replace calcRecons vector class function with intrinsic.

diff -r d85c49059b6a -r 9d22be0b84ff source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Tue Oct 08 13:52:58 2013 +0530
+++ b/source/common/vec/pixel8.inc	Tue Oct 08 14:07:39 2013 +0530
@@ -190,26 +190,34 @@
     {
         for (int x = 0; x < blockSize; x += 16)
         {
-            Vec8s vresi, vpred, vres, vsum1, vsum2;
-            Vec16uc tmp;
+            __m128i resi, pred, sum1, sum2;
+            __m128i temp;
 
-            tmp.load(pPred + x);
+            temp = _mm_loadu_si128((__m128i const*)(pPred + x));
+            pred = _mm_unpacklo_epi8(temp, _mm_setzero_si128());         // interleave with zero extensions
 
-            vpred = extend_low(tmp);
-            vresi.load(pResi + x);
-            vsum1 = vpred + vresi;
-            vsum1 = min(255, max(vsum1, 0));
-            vsum1.store(pRecQt + x);
+            resi = _mm_loadu_si128((__m128i const*)(pResi + x));
+            sum1 = _mm_add_epi16(pred, resi);
 
-            vpred = extend_high(tmp);
-            vresi.load(pResi + x + 8);
-            vsum2 = vpred + vresi;
-            vsum2 = min(255, max(vsum2, 0));
-            vsum2.store(pRecQt + x + 8);
+            __m128i maxval = _mm_set1_epi16(0xff);                       // broadcast 255 (16-bit integer) to all eight lanes of maxval
+            __m128i minval = _mm_set1_epi16(0x00);                       // broadcast 0 (16-bit integer) to all eight lanes of minval
+            sum1 = _mm_min_epi16(maxval, _mm_max_epi16(sum1, minval));
+            _mm_storeu_si128((__m128i*)(pRecQt + x), sum1);
 
-            tmp = compress(vsum1, vsum2);
-            tmp.store(pReco + x);
-            tmp.store(pRecIPred + x);
+            pred = _mm_unpackhi_epi8(temp, _mm_setzero_si128());         // interleave with zero extensions
+            resi = _mm_loadu_si128((__m128i const*)(pResi + x + 8));
+            sum2 = _mm_add_epi16(pred, resi);
+
+            sum2 = _mm_min_epi16(maxval, _mm_max_epi16(sum2, minval));
+            _mm_storeu_si128((__m128i*)(pRecQt + x + 8), sum2);
+
+            __m128i mask = _mm_set1_epi32(0x00FF00FF);                   // mask for low bytes
+            __m128i low_mask  = _mm_and_si128(sum1, mask);               // keep the low byte of each 16-bit lane of sum1
+            __m128i high_mask = _mm_and_si128(sum2, mask);               // keep the low byte of each 16-bit lane of sum2
+            temp = _mm_packus_epi16(low_mask, high_mask);                // unsigned pack
+
+            _mm_storeu_si128((__m128i*)(pReco + x), temp);
+            _mm_storeu_si128((__m128i*)(pRecIPred + x), temp);
         }
 
         pPred     += stride;


More information about the x265-devel mailing list