[x265] [PATCH] pixel8.inc: replace calcRecons8 vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Oct 8 10:23:30 CEST 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381220578 -19800
#      Tue Oct 08 13:52:58 2013 +0530
# Node ID d85c49059b6a30af455cf47ad38ea172c579cb9e
# Parent  65e5bc826d9dfe149cdfddbda131dcc6ebdb42cf
pixel8.inc: replace calcRecons8 vector class function with intrinsic.

diff -r 65e5bc826d9d -r d85c49059b6a source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Tue Oct 08 13:33:45 2013 +0530
+++ b/source/common/vec/pixel8.inc	Tue Oct 08 13:52:58 2013 +0530
@@ -153,23 +153,27 @@
 {
     for (int y = 0; y < 8; y++)
     {
-        Vec8s vresi, vpred, vres, vsum;
-        Vec16uc tmp;
+        __m128i resi, pred, sum;
+        __m128i temp;
 
-        tmp.load(pPred);
-        vpred = extend_low(tmp);
+        temp = _mm_loadu_si128((__m128i const*)pPred);
+        pred = _mm_unpacklo_epi8(temp, _mm_setzero_si128());        // interleave with zero extensions
 
-        vresi.load(pResi);
-        vsum = vpred + vresi;
+        resi = _mm_loadu_si128((__m128i const*)pResi);
+        sum = _mm_add_epi16(pred, resi);
 
-        vsum = min(255, max(vsum, 0));
+        __m128i maxval = _mm_set1_epi16(0xff);                      // broadcast 255 to every 16-bit lane of maxval (upper clamp bound)
+        __m128i minval = _mm_set1_epi16(0x00);                      // broadcast 0 to every 16-bit lane of minval (lower clamp bound)
+        sum = _mm_min_epi16(maxval, _mm_max_epi16(sum, minval));
+        _mm_storeu_si128((__m128i*)pRecQt, sum);
 
-        vsum.store(pRecQt);
+        __m128i mask = _mm_set1_epi32(0x00FF00FF);                  // keeps the low byte of each 16-bit lane
+        __m128i low_mask  = _mm_and_si128(sum, mask);               // low bytes of sum, first pack operand
+        __m128i high_mask = _mm_and_si128(sum, mask);               // identical value, second pack operand
+        temp = _mm_packus_epi16(low_mask, high_mask);               // unsigned pack
 
-        tmp = compress(vsum, vsum);
-
-        store_partial(const_int(8), pReco, tmp);
-        store_partial(const_int(8), pRecIPred, tmp);
+        _mm_storel_epi64((__m128i*)pReco, temp);
+        _mm_storel_epi64((__m128i*)pRecIPred, temp);
 
         pPred     += stride;
         pResi     += stride;


More information about the x265-devel mailing list