[x265] [PATCH] ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Oct 10 09:33:45 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381390085 -19800
# Thu Oct 10 12:58:05 2013 +0530
# Node ID ad1822b8e451ec9de4a8d679e9dee5d0c2b8fa8d
# Parent 49230a47306bd4a4b7c696a7e723d664755a92d7
ipfilter-ssse3.cpp: Replace filterConvertShortToPel vector class function with intrinsic.
diff -r 49230a47306b -r ad1822b8e451 source/common/vec/ipfilter-ssse3.cpp
--- a/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 11:59:16 2013 +0530
+++ b/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 12:58:05 2013 +0530
@@ -199,50 +199,102 @@
}
}
-void filterConvertShortToPel(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height)
+void filterConvertShortToPel(short *source, intptr_t sourceStride, pixel *dest, intptr_t destStride, int width, int height) // For each of height rows: add offset to width 16-bit samples, shift right, clamp to [0, maxval], pack to bytes and store. NOTE(review): the byte-wise stores assume pixel is one byte wide - confirm.
{
- short* srcOrg = src;
- pixel* dstOrg = dst;
+ short* src = source; // keep the first-row source pointer; restored before the tail-column pass
+ pixel* dst = dest; // keep the first-row destination pointer; restored before the tail-column pass
int shift = IF_INTERNAL_PREC - X265_DEPTH;
short offset = IF_INTERNAL_OFFS;
+ offset += shift ? (1 << (shift - 1)) : 0; // rounding term: half of (1 << shift), skipped when shift is zero
+ short maxval = (1 << X265_DEPTH) - 1; // upper clamp bound: X265_DEPTH one-bits
+ int row, col;
- offset += shift ? (1 << (shift - 1)) : 0;
- short maxVal = (1 << X265_DEPTH) - 1;
- Vec8s minVal(0);
- int row, col;
- Vec8s src_c, val_c, val_zero(0);
- Vec16uc val_uc;
+ __m128i minval = _mm_setzero_si128(); // lower clamp bound: zero in every lane
+ __m128i zeroval = _mm_setzero_si128(); // all-zero vector that supplies the upper half of the pack
+ __m128i val1, val2, val3; // val1: loaded shorts, val2: offset/shifted/clamped shorts, val3: packed bytes
+
for (row = 0; row < height; row++)
{
for (col = 0; col < width - 7; col += 8)
{
- src_c.load(src + col);
- val_c = add_saturated(src_c, offset) >> shift;
- val_c = max(val_c, minVal);
- val_c = min(val_c, maxVal);
- val_uc = compress(val_c, val_zero);
- val_uc.store_partial(8, dst + col);
+ val1 = _mm_loadu_si128((__m128i const*)(source + col)); // unaligned load of 8 shorts
+ val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift)); // saturating add of offset, then arithmetic right shift by shift
+ val2 = _mm_max_epi16(val2, minval); // clamp below at zero
+ val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval)); // clamp above at maxval
+
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // keeps the low byte of every 16-bit lane
+ __m128i lowm = _mm_and_si128(val2, mask); // low byte of each clamped value
+ __m128i highm = _mm_and_si128(zeroval, mask); // always zero (zeroval & mask); fills the upper 8 packed bytes
+ val3 = _mm_packus_epi16(lowm, highm); // pack 16-bit lanes to unsigned bytes: the 8 results occupy the low 8 bytes
+
+ union
+ {
+ int8_t c[16];
+ int64_t q[2];
+ } u;
+ _mm_storeu_si128((__m128i*)u.c, val3); // spill the vector so only its low 8 bytes get written out
+ *(int64_t*)(dest + col) = u.q[0]; // write 8 packed bytes; NOTE(review): 64-bit store through a cast pixel pointer - confirm alignment/aliasing is acceptable
}
-
- src += srcStride;
- dst += dstStride;
+ source += sourceStride; // step to the next source row
+ dest += destStride; // step to the next destination row
}
if (width % 8 != 0)
{
- src = srcOrg;
- dst = dstOrg;
+ source = src; // rewind to the first row for the tail columns
+ dest = dst; // rewind to the first row for the tail columns
col = width - (width % 8);
for (row = 0; row < height; row++)
{
- src_c.load(src + col);
- val_c = add_saturated(src_c, offset) >> shift;
- val_c = max(val_c, minVal);
- val_c = min(val_c, maxVal);
- val_uc = compress(val_c, val_zero);
- val_uc.store_partial(width - col, dst + col);
- src += srcStride;
- dst += dstStride;
+ val1 = _mm_loadu_si128((__m128i const*)(source + col)); // NOTE(review): loads a full 8 shorts although fewer than 8 columns remain - confirm rows are padded for this read
+ val2 = _mm_sra_epi16(_mm_adds_epi16(val1, _mm_set1_epi16(offset)), _mm_cvtsi32_si128(shift)); // saturating add of offset, then arithmetic right shift by shift
+ val2 = _mm_max_epi16(val2, minval); // clamp below at zero
+ val2 = _mm_min_epi16(val2, _mm_set1_epi16(maxval)); // clamp above at maxval
+
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // keeps the low byte of every 16-bit lane
+ __m128i lowm = _mm_and_si128(val2, mask); // low byte of each clamped value
+ __m128i highm = _mm_and_si128(zeroval, mask); // always zero (zeroval & mask); fills the upper 8 packed bytes
+ val3 = _mm_packus_epi16(lowm, highm); // pack 16-bit lanes to unsigned bytes: the 8 results occupy the low 8 bytes
+
+ int n = width - col; // leftover column count; equals width % 8, which is non-zero in this branch
+ if (n >= 16) // never true here because n == width % 8
+ {
+ _mm_storeu_si128((__m128i*)(dest + col), val3);
+ }
+ else if (n <= 0) ; // nothing to store when n is zero or negative
+ else
+ {
+ union
+ {
+ int8_t c[16];
+ int16_t s[8];
+ int32_t i[4];
+ int64_t q[2];
+ } u;
+ _mm_storeu_si128((__m128i*)u.c, val3); // spill the packed bytes so they can be copied out piecewise
+ int j = 0; // byte offset of the next piece to copy
+ if (n & 8) // n in 8..15: copy 8 bytes (not reached here since n == width % 8)
+ {
+ *(int64_t*)(dest + col) = u.q[0];
+ j += 8;
+ }
+ if (n & 4) // n == (4,5,6,7,12,13,14,15): copy 4 bytes
+ {
+ ((int32_t*)(dest + col))[j/4] = u.i[j/4]; // 32-bit element j/4 sits at byte offset j
+ j += 4;
+ }
+ if (n & 2) // n == (2,3,6,7,10,11,14,15): copy 2 bytes
+ {
+ ((int16_t*)(dest + col))[j/2] = u.s[j/2]; // 16-bit element j/2 sits at byte offset j
+ j += 2;
+ }
+ if (n & 1) // n == (1,3,5,7,9,11,13,15): copy the final byte
+ {
+ ((int8_t*)(dest + col))[j] = u.c[j];
+ }
+ }
+ source += sourceStride; // step to the next source row
+ dest += destStride; // step to the next destination row
}
}
}
More information about the x265-devel
mailing list