[x265] [PATCH] pixel8.inc: replace calcRecons4 vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Oct 8 10:05:03 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381219425 -19800
# Tue Oct 08 13:33:45 2013 +0530
# Node ID 65e5bc826d9dfe149cdfddbda131dcc6ebdb42cf
# Parent d71078917df01e92605158a13b45ab35ee7cfc1c
pixel8.inc: replace calcRecons4 vector class function with intrinsic.
diff -r d71078917df0 -r 65e5bc826d9d source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc Mon Oct 07 12:48:32 2013 +0530
+++ b/source/common/vec/pixel8.inc Tue Oct 08 13:33:45 2013 +0530
@@ -119,23 +119,27 @@
{
for (int y = 0; y < 4; y++)
{
- Vec8s vresi, vpred, vres, vsum;
- Vec16uc tmp;
+ __m128i resi, pred, sum;
+ __m128i temp;
- tmp = load_partial(const_int(4), pPred);
- vpred = extend_low(tmp);
+ temp = _mm_cvtsi32_si128(*(uint32_t*)pPred);
+ pred = _mm_unpacklo_epi8(temp, _mm_setzero_si128()); // interleave with 0
- vresi = load_partial(const_int(8), pResi);
- vsum = vpred + vresi;
+ resi = _mm_loadl_epi64((__m128i*)pResi);
+ sum = _mm_add_epi16(pred, resi);
- vsum = min(255, max(vsum, 0));
+ __m128i maxval = _mm_set1_epi16(0xff); // broadcast 255 to all eight 16-bit elements of maxval
+ __m128i minval = _mm_set1_epi16(0x00); // broadcast 0 to all eight 16-bit elements of minval
+ sum = _mm_min_epi16(maxval, _mm_max_epi16(sum, minval));
+ _mm_storel_epi64((__m128i*)pRecQt, sum);
- store_partial(const_int(8), pRecQt, vsum);
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes
+ __m128i low_mask = _mm_and_si128(sum, mask); // bytes of low
+ __m128i high_mask = _mm_and_si128(sum, mask); // bytes of high
+ temp = _mm_packus_epi16(low_mask, high_mask); // unsigned pack
- tmp = compress(vsum, vsum);
-
- store_partial(const_int(4), pReco, tmp);
- store_partial(const_int(4), pRecIPred, tmp);
+ *(uint32_t*)pReco = _mm_cvtsi128_si32(temp);
+ *(uint32_t*)pRecIPred = _mm_cvtsi128_si32(temp);
pPred += stride;
pResi += stride;
More information about the x265-devel
mailing list