[x265] [PATCH] Replace "pixeladd_ss" vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 4 15:12:40 CEST 2013
# HG changeset patch
# User Dnyaneshwar
# Date 1380892320 -19800
# Fri Oct 04 18:42:00 2013 +0530
# Node ID ccd6887206bee669fba70430d4843371da58711d
# Parent d8973c8d2a81e74127d3d799d9f300d58a37c466
Replace the "pixeladd_ss" vector-class implementation with SSE intrinsics.
Measured performance is the same as that of the vector-class version.
diff -r d8973c8d2a81 -r ccd6887206be source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 18:03:32 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 18:42:00 2013 +0530
@@ -230,21 +230,24 @@
if ( !(aligncheck & 15) && !(bx & 7))
{
- Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ __m128i maxval = _mm_set1_epi16((1 << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
// fast path, multiples of 8 pixel wide blocks
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 8)
{
- Vec8s vecsrc0, vecsrc1, vecsum;
- vecsrc0.load_a(src0 + x);
- vecsrc1.load_a(src1 + x);
+ __m128i word0, word1, sum;
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
+ word0 = _mm_load_si128((__m128i*)(src0 + x)); // load 16 bytes from src0
+ word1 = _mm_load_si128((__m128i*)(src1 + x)); // load 16 bytes from src1
- vecsum.store(dst + x);
+ sum = _mm_adds_epi16(word0, word1);
+ sum = _mm_max_epi16(sum, zero);
+ sum = _mm_min_epi16(sum, maxval);
+
+ _mm_store_si128((__m128i*)&dst[x], sum); // store block into dst
}
src0 += sstride0;
@@ -254,20 +257,23 @@
}
else if (!(bx & 7))
{
- Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+ __m128i maxval = _mm_set1_epi16((1 << X265_DEPTH) - 1);
+ __m128i zero = _mm_setzero_si128();
+
for (int y = 0; y < by; y++)
{
for (int x = 0; x < bx; x += 8)
{
- Vec8s vecsrc0, vecsrc1, vecsum;
- vecsrc0.load(src0 + x);
- vecsrc1.load(src1 + x);
+ __m128i word0, word1, sum;
- vecsum = add_saturated(vecsrc0, vecsrc1);
- vecsum = max(vecsum, zero);
- vecsum = min(vecsum, maxval);
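+ word0 = _mm_load_si128((__m128i*)(src0 + x)); // load 16 bytes from src0; NOTE(review): replaces Vec8s::load (not load_a) in the non-aligned branch -- confirm _mm_loadu_si128 is not required
+ word1 = _mm_load_si128((__m128i*)(src1 + x)); // load 16 bytes from src1; NOTE(review): same alignment concern as the src0 load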
- vecsum.store(dst + x);
+ sum = _mm_adds_epi16(word0, word1);
+ sum = _mm_max_epi16(sum, zero);
+ sum = _mm_min_epi16(sum, maxval);
+
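+ _mm_store_si128((__m128i*)&dst[x], sum); // store 16 bytes into dst; NOTE(review): replaces Vec8s::store in the non-aligned branch -- confirm _mm_storeu_si128 is not required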
}
src0 += sstride0;
More information about the x265-devel
mailing list