[x265] [PATCH] sse_pp32: Improved performance for all versions above 1x

praveen at multicorewareinc.com
Wed Aug 7 06:42:57 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1375850566 -19800
# Node ID 82efd9fd4b9f63f59de1216e67e51d5294753f9b
# Parent  2ae1a03495926cebc82947dceb206e9a1510baae
sse_pp32: Improved performance for all versions above 1x

Replace the Vec16uc/Vec8us vector-class implementation with SSE4.1
intrinsics. Pixel differences are computed in 16-bit lanes, then
squared and pair-summed in a single _mm_madd_epi16 (PMADDWD) step,
replacing the separate widening multiply plus 32-bit extend/add chain.
The function is now guarded by INSTRSET >= X265_CPU_LEVEL_SSE41
because _mm_cvtepu8_epi16 (PMOVZXBW) requires SSE4.1.
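
The heart of the change is the widen-then-PMADDWD pattern. As a
minimal sketch (not part of the patch; ssd16 and its arguments are
illustrative names), one 16-byte load is processed like this:

    #include <smmintrin.h>   // SSE4.1 intrinsics
    #include <stdint.h>

    // Accumulate squared differences for 16 pixel pairs into the four
    // 32-bit lanes of 'sum', mirroring the pattern in the patch below.
    static inline __m128i ssd16(__m128i sum, const uint8_t* a, const uint8_t* b)
    {
        __m128i m = _mm_loadu_si128((const __m128i*)a);
        __m128i n = _mm_loadu_si128((const __m128i*)b);

        // Zero-extend each 8-byte half to eight 16-bit lanes (PMOVZXBW).
        __m128i mlo = _mm_cvtepu8_epi16(m);
        __m128i mhi = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
        __m128i nlo = _mm_cvtepu8_epi16(n);
        __m128i nhi = _mm_cvtepu8_epi16(_mm_srli_si128(n, 8));

        // PMADDWD squares each 16-bit difference and sums adjacent
        // pairs, yielding four 32-bit partial sums per madd.
        __m128i d = _mm_sub_epi16(mlo, nlo);
        sum = _mm_add_epi32(sum, _mm_madd_epi16(d, d));
        d = _mm_sub_epi16(mhi, nhi);
        return _mm_add_epi32(sum, _mm_madd_epi16(d, d));
    }

With worst-case 8-bit inputs each 32-bit lane grows by at most
4 * 255^2 per call, so two calls per row over 64 rows (about 33M)
stays well inside a signed 32-bit accumulator.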

diff -r 2ae1a0349592 -r 82efd9fd4b9f source/common/vec/sse.inc
--- a/source/common/vec/sse.inc	Wed Aug 07 09:53:55 2013 +0530
+++ b/source/common/vec/sse.inc	Wed Aug 07 10:12:46 2013 +0530
@@ -178,40 +178,60 @@
     return horizontal_add(sum_low) + horizontal_add(sum_high);
 }
 
+#if INSTRSET >= X265_CPU_LEVEL_SSE41
 template<int ly>
 int sse_pp32(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
 {
     int rows = ly;
-    Vec16uc m1, n1;
+    __m128i sum = _mm_set1_epi32(0);
 
-    Vec8us diff_low(0), diff_high(0);
-    Vec4i sum_low(0), sum_high(0);
     for (; rows != 0; rows--)
     {
-        m1.load(Org);
-        n1.load(Cur);
-        diff_low = extend_low(m1) - extend_low(n1);
-        diff_high = extend_high(m1) - extend_high(n1);
-        diff_low = diff_low * diff_low;
-        diff_high = diff_high * diff_high;
-        sum_low += (extend_low(diff_low) + extend_low(diff_high));
-        sum_high += (extend_high(diff_low) + extend_high(diff_high));
+        __m128i m1 = _mm_loadu_si128((__m128i const*)(Org));
+        __m128i n1 = _mm_loadu_si128((__m128i const*)(Cur));
 
-        m1.load(Org + 16);
-        n1.load(Cur + 16);
-        diff_low = extend_low(m1) - extend_low(n1);
-        diff_high = extend_high(m1) - extend_high(n1);
-        diff_low = diff_low * diff_low;
-        diff_high = diff_high * diff_high;
-        sum_low += (extend_low(diff_low) + extend_low(diff_high));
-        sum_high += (extend_high(diff_low) + extend_high(diff_high));
+        __m128i m1lo = _mm_cvtepu8_epi16(m1);
+        __m128i m1hi = _mm_srli_si128(m1, 8);
+        m1hi = _mm_cvtepu8_epi16(m1hi);
+
+        __m128i n1lo = _mm_cvtepu8_epi16(n1);
+        __m128i n1hi = _mm_srli_si128(n1, 8);
+        n1hi = _mm_cvtepu8_epi16(n1hi);
+
+        __m128i diff = _mm_sub_epi16(m1lo, n1lo);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+
+        diff = _mm_sub_epi16(m1hi, n1hi);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+
+        m1 = _mm_loadu_si128((__m128i const*)(Org + 16));
+        n1 = _mm_loadu_si128((__m128i const*)(Cur + 16));
+
+        m1lo = _mm_cvtepu8_epi16(m1);
+        m1hi = _mm_srli_si128(m1, 8);
+        m1hi = _mm_cvtepu8_epi16(m1hi);
+
+        n1lo = _mm_cvtepu8_epi16(n1);
+        n1hi = _mm_srli_si128(n1, 8);
+        n1hi = _mm_cvtepu8_epi16(n1hi);
+
+        diff = _mm_sub_epi16(m1lo, n1lo);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+
+        diff = _mm_sub_epi16(m1hi, n1hi);
+        sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+
         Org += strideOrg;
         Cur += strideCur;
     }
 
-    return horizontal_add(sum_low) + horizontal_add(sum_high);
+    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+    return _mm_cvtsi128_si32(sum);
 }
 
+#endif /* if INSTRSET >= X265_CPU_LEVEL_SSE41 */
+
 template<int ly>
 int sse_pp48(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
 {
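
For anyone checking the new kernel, a plain scalar reference of what
sse_pp32<ly> computes (the sum of squared differences over a
32-pixel-wide, ly-row block); a sketch assuming 8-bit pixels, with
illustrative names not taken from the patch:

    #include <stddef.h>
    #include <stdint.h>

    // Scalar reference: sum of squared pixel differences over a
    // 32-wide block of 'ly' rows.
    static int sse_pp32_ref(int ly, const uint8_t* org, ptrdiff_t strideOrg,
                            const uint8_t* cur, ptrdiff_t strideCur)
    {
        int sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                int d = org[x] - cur[x];
                sum += d * d;
            }
            org += strideOrg;
            cur += strideCur;
        }
        return sum;
    }

The intrinsic version must match this for every ly the template is
instantiated with; the final _mm_srli_si128/_mm_add_epi32 pair in the
patch folds the four 32-bit lanes down to that single scalar result.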

