[x265] [PATCH] replace blockcopy_s_p (pixel to short) vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 4 13:41:17 CEST 2013


# HG changeset patch
# User Dnyaneshwar
# Date 1380886851 -19800
#      Fri Oct 04 17:10:51 2013 +0530
# Node ID f144605171a4bda4880472cd5b193678f0e7b80b
# Parent  f4100c037a0d6f64d78a8a313e175f6c8445e30b
Replace the blockcopy_s_p (pixel to short) vector class function with SSE intrinsics.
Performance is the same as that of the vector class function.

diff -r f4100c037a0d -r f144605171a4 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 16:55:16 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 17:10:51 2013 +0530
@@ -148,10 +148,11 @@
         {
             for (int x = 0; x < bx; x += 16)
             {
-                Vec16uc word;
-                word.load_a(src + x);
-                extend_low(word).store_a(dst + x);
-                extend_high(word).store_a(dst + x + 8);
+                __m128i word0 = _mm_load_si128((__m128i const*)(src + x));        // load block of 16 byte from src
+                __m128i word1 = _mm_unpacklo_epi8(word0, _mm_setzero_si128());    // interleave with zero extensions
+                _mm_store_si128((__m128i*)&dst[x], word1);                        // store block into dst
+                __m128i word2 = _mm_unpackhi_epi8(word0, _mm_setzero_si128());    // interleave with zero extensions
+                _mm_store_si128((__m128i*)&dst[x + 8], word2);                    // store block into dst
             }
 
             src += sstride;


More information about the x265-devel mailing list