[x265] [PATCH] ipfilter-ssse3.cpp: Replace filterConvertPelToShort vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Oct 10 08:30:20 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381386556 -19800
# Thu Oct 10 11:59:16 2013 +0530
# Node ID 49230a47306bd4a4b7c696a7e723d664755a92d7
# Parent 5dceef85c58cd001f4c65b4037085f921e748715
ipfilter-ssse3.cpp: Replace filterConvertPelToShort vector class function with intrinsic.
diff -r 5dceef85c58c -r 49230a47306b source/common/vec/ipfilter-ssse3.cpp
--- a/source/common/vec/ipfilter-ssse3.cpp Wed Oct 09 15:57:24 2013 -0500
+++ b/source/common/vec/ipfilter-ssse3.cpp Thu Oct 10 11:59:16 2013 +0530
@@ -129,42 +129,72 @@
}
}
-void filterConvertPelToShort(pixel *src, intptr_t srcStride, short *dst, intptr_t dstStride, int width, int height)
+void filterConvertPelToShort(pixel *source, intptr_t sourceStride, short *dest, intptr_t destStride, int width, int height)
{
- pixel* srcOrg = src;
- short* dstOrg = dst;
+ pixel* src = source;
+ short* dst = dest;
int shift = IF_INTERNAL_PREC - X265_DEPTH;
int row, col;
- Vec16uc src_v;
- Vec8s dst_v, val_v;
+
+ __m128i val1, val2, val3;
for (row = 0; row < height; row++)
{
for (col = 0; col < width - 7; col += 8)
{
- src_v.load(src + col);
- val_v = extend_low(src_v) << shift;
- dst_v = val_v - IF_INTERNAL_OFFS;
- dst_v.store(dst + col);
+ val1 = _mm_loadu_si128((__m128i const*)(source + col));
+ val2 = _mm_sll_epi16(_mm_unpacklo_epi8(val1, _mm_setzero_si128()), _mm_cvtsi32_si128(shift));
+ val3 = _mm_sub_epi16(val2, _mm_set1_epi16(IF_INTERNAL_OFFS));
+ _mm_storeu_si128((__m128i*)(dest + col), val3);
}
-
- src += srcStride;
- dst += dstStride;
+ source += sourceStride;
+ dest += destStride;
}
-
if (width % 8 != 0)
{
- src = srcOrg;
- dst = dstOrg;
+ source = src;
+ dest = dst;
col = width - (width % 8);
for (row = 0; row < height; row++)
{
- src_v.load(src + col);
- val_v = extend_low(src_v) << shift;
- dst_v = val_v - IF_INTERNAL_OFFS;
- dst_v.store_partial(width - col, dst + col);
- src += srcStride;
- dst += dstStride;
+ val1 = _mm_loadu_si128((__m128i const*)(source + col));
+ val2 = _mm_sll_epi16(_mm_unpacklo_epi8(val1, _mm_setzero_si128()), _mm_cvtsi32_si128(shift));
+ val3 = _mm_sub_epi16(val2, _mm_set1_epi16(IF_INTERNAL_OFFS));
+
+ int n = width - col;
+ if (n >= 8)
+ {
+ _mm_storeu_si128((__m128i*)(dest + col), val3);
+ }
+ else if (n <= 0) ; // do nothing if n is zero or negative
+ else
+ {
+ union
+ {
+ int8_t c[16];
+ int16_t s[8];
+ int32_t i[4];
+ int64_t q[2];
+ } u;
+ _mm_storeu_si128((__m128i*)u.c, val3);
+ int j = 0;
+ if (n & 4) // n == (4,5,6,7)
+ {
+ *(int64_t*)(dest + col) = u.q[0];
+ j += 8;
+ }
+ if (n & 2) // n == (2,3,6,7)
+ {
+ ((int32_t*)(dest + col))[j/4] = u.i[j/4];
+ j += 4;
+ }
+ if (n & 1) // n == (1,3,5,7)
+ {
+ ((int16_t*)(dest + col))[j/2] = u.s[j/2];
+ }
+ }
+ source += sourceStride;
+ dest += destStride;
}
}
}
More information about the x265-devel mailing list