[x265] [PATCH] Replace "pixeladd_ss" vector class function with intrinsic

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 4 15:12:40 CEST 2013


# HG changeset patch
# User Dnyaneshwar
# Date 1380892320 -19800
#      Fri Oct 04 18:42:00 2013 +0530
# Node ID ccd6887206bee669fba70430d4843371da58711d
# Parent  d8973c8d2a81e74127d3d799d9f300d58a37c466
Replace "pixeladd_ss" vector class function with intrinsic.
Measured performance is the same as that of the vector class function.

diff -r d8973c8d2a81 -r ccd6887206be source/common/vec/blockcopy-sse3.cpp
--- a/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 18:03:32 2013 +0530
+++ b/source/common/vec/blockcopy-sse3.cpp	Fri Oct 04 18:42:00 2013 +0530
@@ -230,21 +230,24 @@
 
     if ( !(aligncheck & 15) && !(bx & 7))
     {
-        Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+        __m128i maxval = _mm_set1_epi16((1 << X265_DEPTH) - 1);
+        __m128i zero = _mm_setzero_si128();
+
         // fast path, multiples of 8 pixel wide blocks
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 8)
             {
-                Vec8s vecsrc0, vecsrc1, vecsum;
-                vecsrc0.load_a(src0 + x);
-                vecsrc1.load_a(src1 + x);
+                __m128i word0, word1, sum;
 
-                vecsum = add_saturated(vecsrc0, vecsrc1);
-                vecsum = max(vecsum, zero);
-                vecsum = min(vecsum, maxval);
+                word0 = _mm_load_si128((__m128i*)(src0 + x));    // load 16 bytes from src0
+                word1 = _mm_load_si128((__m128i*)(src1 + x));    // load 16 bytes from src1
 
-                vecsum.store(dst + x);
+                sum = _mm_adds_epi16(word0, word1);
+                sum = _mm_max_epi16(sum, zero);
+                sum = _mm_min_epi16(sum, maxval);
+
+                _mm_store_si128((__m128i*)&dst[x], sum);    // store block into dst
             }
 
             src0 += sstride0;
@@ -254,20 +257,23 @@
     }
     else if (!(bx & 7))
     {
-        Vec8s zero(0), maxval((1 << X265_DEPTH) - 1);
+        __m128i maxval = _mm_set1_epi16((1 << X265_DEPTH) - 1);
+        __m128i zero = _mm_setzero_si128();
+
         for (int y = 0; y < by; y++)
         {
             for (int x = 0; x < bx; x += 8)
             {
-                Vec8s vecsrc0, vecsrc1, vecsum;
-                vecsrc0.load(src0 + x);
-                vecsrc1.load(src1 + x);
+                __m128i word0, word1, sum;
 
-                vecsum = add_saturated(vecsrc0, vecsrc1);
-                vecsum = max(vecsum, zero);
-                vecsum = min(vecsum, maxval);
+                word0 = _mm_load_si128((__m128i*)(src0 + x));    // load 16 bytes from src0
+                word1 = _mm_load_si128((__m128i*)(src1 + x));    // load 16 bytes from src1
 
-                vecsum.store(dst + x);
+                sum = _mm_adds_epi16(word0, word1);
+                sum = _mm_max_epi16(sum, zero);
+                sum = _mm_min_epi16(sum, maxval);
+
+                _mm_store_si128((__m128i*)&dst[x], sum);    // store block into dst
             }
 
             src0 += sstride0;


More information about the x265-devel mailing list