[x265] [PATCH] replace block_copy_p_s (short to pixel) vector class function with intrinsic
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 4 13:26:07 CEST 2013
# HG changeset patch
# User Dnyaneshwar
# Date 1380885916 -19800
# Fri Oct 04 16:55:16 2013 +0530
# Node ID f4100c037a0d6f64d78a8a313e175f6c8445e30b
# Parent 69943bfd02a2feea711da586eb15c7ac77fa700d
Replace the block_copy_p_s (short to pixel) vector class function with intrinsics.
Measured performance is the same as that of the vector class function.
diff -r 69943bfd02a2 -r f4100c037a0d source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 16:27:02 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp Fri Oct 04 16:55:16 2013 +0530
@@ -106,10 +106,14 @@
{
for (int x = 0; x < bx; x += 16)
{
- Vec8us word0, word1;
- word0.load_a(src + x);
- word1.load_a(src + x + 8);
- compress(word0, word1).store_a(dst + x);
+ __m128i word0 = _mm_load_si128((__m128i const*)(src + x)); // load block of 16 byte from src
+ __m128i word1 = _mm_load_si128((__m128i const*)(src + x + 8));
+
+ __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes
+ __m128i low_mask = _mm_and_si128(word0, mask); // bytes of low
+ __m128i high_mask = _mm_and_si128(word1, mask); // bytes of high
+ __m128i word01 = _mm_packus_epi16(low_mask, high_mask); // unsigned pack
+ _mm_store_si128((__m128i*)&dst[x], word01); // store block into dst
}
src += sstride;
More information about the x265-devel
mailing list