[x265] [PATCH] replace "pixelsub_sp" vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 4 14:34:24 CEST 2013


# HG changeset patch
# User Dnyaneshwar
# Date 1380890012 -19800
#      Fri Oct 04 18:03:32 2013 +0530
# Node ID d8973c8d2a81e74127d3d799d9f300d58a37c466
# Parent  f144605171a4bda4880472cd5b193678f0e7b80b
Replace the "pixelsub_sp" vector class function with SSE intrinsics.
Performance is the same as that of the vector class function.

diff -r f144605171a4 -r d8973c8d2a81 source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 17:10:51 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 18:03:32 2013 +0530
@@ -186,14 +186,20 @@
         {
             for (int x = 0; x < bx; x += 16)
             {
-                Vec16uc word0, word1;
-                Vec8s word3, word4;
-                word0.load_a(src0 + x);
-                word1.load_a(src1 + x);
-                word3 = extend_low(word0) - extend_low(word1);
-                word4 = extend_high(word0) - extend_high(word1);
-                word3.store_a(dst + x);
-                word4.store_a(dst + x + 8);
+                __m128i word0, word1;
+                __m128i word3, word4;
+                __m128i mask = _mm_setzero_si128();
+
+                word0 = _mm_load_si128((__m128i const*)(src0 + x));    // load 16 bytes from src0
+                word1 = _mm_load_si128((__m128i const*)(src1 + x));    // load 16 bytes from src1
+
+                word3 = _mm_unpacklo_epi8(word0, mask);    // interleave with zero extensions
+                word4 = _mm_unpacklo_epi8(word1, mask);
+                _mm_store_si128((__m128i*)&dst[x], _mm_subs_epi16(word3, word4));    // store block into dst
+
+                word3 = _mm_unpackhi_epi8(word0, mask);    // interleave with zero extensions
+                word4 = _mm_unpackhi_epi8(word1, mask);
+                _mm_store_si128((__m128i*)&dst[x + 8], _mm_subs_epi16(word3, word4));    // store block into dst
             }
 
             src0 += sstride0;


More information about the x265-devel mailing list