[x265] [PATCH] replace block_copy_p_s (short to pixel) vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 4 13:26:07 CEST 2013


# HG changeset patch
# User Dnyaneshwar
# Date 1380885916 -19800
#      Fri Oct 04 16:55:16 2013 +0530
# Node ID f4100c037a0d6f64d78a8a313e175f6c8445e30b
# Parent  69943bfd02a2feea711da586eb15c7ac77fa700d
replace block_copy_p_s (short to pixel) vector class function with intrinsic.
Measured performance is the same as that of the vector class function.

diff -r 69943bfd02a2 -r f4100c037a0d source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 16:27:02 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 16:55:16 2013 +0530
@@ -106,10 +106,14 @@
         {
             for (int x = 0; x < bx; x += 16)
             {
-                Vec8us word0, word1;
-                word0.load_a(src + x);
-                word1.load_a(src + x + 8);
-                compress(word0, word1).store_a(dst + x);
+                __m128i word0 = _mm_load_si128((__m128i const*)(src + x));       // load block of 16 byte from src
+                __m128i word1 = _mm_load_si128((__m128i const*)(src + x + 8));
+
+                __m128i mask = _mm_set1_epi32(0x00FF00FF);                  // mask for low bytes
+                __m128i low_mask = _mm_and_si128(word0, mask);              // bytes of low
+                __m128i high_mask = _mm_and_si128(word1, mask);             // bytes of high
+                __m128i word01 = _mm_packus_epi16(low_mask, high_mask);     // unsigned pack
+                _mm_store_si128((__m128i*)&dst[x], word01);                 // store block into dst
             }
 
             src += sstride;


More information about the x265-devel mailing list