[x265] [PATCH] replace "pixelsub_sp" vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 4 14:34:24 CEST 2013
# HG changeset patch
# User Dnyaneshwar
# Date 1380890012 -19800
# Fri Oct 04 18:03:32 2013 +0530
# Node ID d8973c8d2a81e74127d3d799d9f300d58a37c466
# Parent f144605171a4bda4880472cd5b193678f0e7b80b
replace "pixelsub_sp" vector class function with intrinsic.
Performance is the same as that of the vector class function.
diff -r f144605171a4 -r d8973c8d2a81 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 17:10:51 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 18:03:32 2013 +0530
@@ -186,14 +186,20 @@
{
for (int x = 0; x < bx; x += 16)
{
- Vec16uc word0, word1;
- Vec8s word3, word4;
- word0.load_a(src0 + x);
- word1.load_a(src1 + x);
- word3 = extend_low(word0) - extend_low(word1);
- word4 = extend_high(word0) - extend_high(word1);
- word3.store_a(dst + x);
- word4.store_a(dst + x + 8);
+ __m128i word0, word1;
+ __m128i word3, word4;
+ __m128i mask = _mm_setzero_si128();
+
+ word0 = _mm_load_si128((__m128i const*)(src0 + x)); // load 16 bytes from src0
+ word1 = _mm_load_si128((__m128i const*)(src1 + x)); // load 16 bytes from src1
+
+ word3 = _mm_unpacklo_epi8(word0, mask); // interleave with zero extensions
+ word4 = _mm_unpacklo_epi8(word1, mask);
+ _mm_store_si128((__m128i*)&dst[x], _mm_subs_epi16(word3, word4)); // store block into dst
+
+ word3 = _mm_unpackhi_epi8(word0, mask); // interleave with zero extensions
+ word4 = _mm_unpackhi_epi8(word1, mask);
+ _mm_store_si128((__m128i*)&dst[x + 8], _mm_subs_epi16(word3, word4)); // store block into dst
}
src0 += sstride0;
More information about the x265-devel
mailing list