[x265] [PATCH] blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Oct 10 11:51:41 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381398657 -19800
# Thu Oct 10 15:20:57 2013 +0530
# Node ID 12d098e5d907249d2f450ddc329cbeede88f9e1c
# Parent a79ecf3a787577a2e557659c7a8d226d7d41ce00
blockcopy-sse3.cpp: Replace pixeladd_pp vector class function with intrinsic.
diff -r a79ecf3a7875 -r 12d098e5d907 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp Thu Oct 10 15:20:57 2013 +0530
@@ -305,25 +305,25 @@
void pixeladd_pp(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1)
{
size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | sstride1 | dstride;
-
+ int i = 1;
if (!(aligncheck & 15))
{
- Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ __m128i maxval = _mm_set1_epi8((i << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
// fast path, multiples of 16 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 16)
{
- Vec16uc vecsrc0, vecsrc1, vecsum;
- vecsrc0.load_a(src0 + x);
- vecsrc1.load_a(src1 + x);
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
-
- vecsum.store(dst + x);
+ __m128i word0, word1, sum;
+ word0 = _mm_load_si128((__m128i const*)(src0 + x));
+ word1 = _mm_load_si128((__m128i const*)(src1 + x));
+ sum = _mm_adds_epu8(word0, word1);
+ sum = _mm_max_epu8(sum, zero);
+ sum = _mm_min_epu8(sum, maxval);
+ _mm_storeu_si128((__m128i*)&dst[x], sum);
}
-
src0 += sstride0;
src1 += sstride1;
dst += dstride;
@@ -331,22 +331,22 @@
}
else if (!(bx & 15))
{
- Vec16uc zero(0), maxval((1 << X265_DEPTH) - 1);
+ __m128i maxval = _mm_set1_epi8((i << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
// fast path, multiples of 16 pixel wide blocks but pointers/strides require unaligned accesses
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 16)
{
- Vec16uc vecsrc0, vecsrc1, vecsum;
- vecsrc0.load(src0 + x);
- vecsrc1.load(src1 + x);
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
-
- vecsum.store(dst + x);
+ __m128i word0, word1, sum;
+ word0 = _mm_load_si128((__m128i const*)(src0 + x));
+ word1 = _mm_load_si128((__m128i const*)(src1 + x));
+ sum = _mm_adds_epu8(word0, word1);
+ sum = _mm_max_epu8(sum, zero);
+ sum = _mm_min_epu8(sum, maxval);
+ _mm_storeu_si128((__m128i*)&dst[x], sum);
}
-
src0 += sstride0;
src1 += sstride1;
dst += dstride;
More information about the x265-devel mailing list