[x265] [PATCH] sse_pp64: +1x over last commit

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Aug 8 14:08:15 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1375963687 -19800
# Node ID 9a910be357aef42b9f17bc0be15397704982b408
# Parent  068c15b167f1f77a05e5310c12d7115db5f7e078
sse_pp64: +1x over last commit

diff -r 068c15b167f1 -r 9a910be357ae source/common/vec/sse.inc
--- a/source/common/vec/sse.inc	Thu Aug 08 17:29:58 2013 +0530
+++ b/source/common/vec/sse.inc	Thu Aug 08 17:38:07 2013 +0530
@@ -296,7 +296,8 @@
 int sse_pp64(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
 {
     int rows = ly;
-    __m128i sum = _mm_set1_epi32(0);
+    __m128i sum = _mm_setzero_si128();
+    __m128i zero = _mm_setzero_si128();
 
     for (; rows != 0; rows--)
     {
@@ -304,12 +305,10 @@
         __m128i n1 = _mm_loadu_si128((__m128i const*)(Cur));
 
         __m128i m1lo = _mm_cvtepu8_epi16(m1);
-        __m128i m1hi = _mm_srli_si128(m1, 8);
-        m1hi = _mm_cvtepu8_epi16(m1hi);
+        __m128i m1hi = _mm_unpackhi_epi8(m1, zero);
 
         __m128i n1lo = _mm_cvtepu8_epi16(n1);
-        __m128i n1hi = _mm_srli_si128(n1, 8);
-        n1hi = _mm_cvtepu8_epi16(n1hi);
+        __m128i n1hi = _mm_unpackhi_epi8(n1, zero);
 
         __m128i diff = _mm_sub_epi16(m1lo, n1lo);
         sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
@@ -321,12 +320,10 @@
         n1 = _mm_loadu_si128((__m128i const*)(Cur + 16));
 
         m1lo = _mm_cvtepu8_epi16(m1);
-        m1hi = _mm_srli_si128(m1, 8);
-        m1hi = _mm_cvtepu8_epi16(m1hi);
+        m1hi = _mm_unpackhi_epi8(m1, zero);
 
         n1lo = _mm_cvtepu8_epi16(n1);
-        n1hi = _mm_srli_si128(n1, 8);
-        n1hi = _mm_cvtepu8_epi16(n1hi);
+        n1hi = _mm_unpackhi_epi8(n1, zero);
 
         diff = _mm_sub_epi16(m1lo, n1lo);
         sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
@@ -338,12 +335,10 @@
         n1 = _mm_loadu_si128((__m128i const*)(Cur + 32));
 
         m1lo = _mm_cvtepu8_epi16(m1);
-        m1hi = _mm_srli_si128(m1, 8);
-        m1hi = _mm_cvtepu8_epi16(m1hi);
+        m1hi = _mm_unpackhi_epi8(m1, zero);
 
         n1lo = _mm_cvtepu8_epi16(n1);
-        n1hi = _mm_srli_si128(n1, 8);
-        n1hi = _mm_cvtepu8_epi16(n1hi);
+        n1hi = _mm_unpackhi_epi8(n1, zero);
 
         diff = _mm_sub_epi16(m1lo, n1lo);
         sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
@@ -355,12 +350,10 @@
         n1 = _mm_loadu_si128((__m128i const*)(Cur + 48));
 
         m1lo = _mm_cvtepu8_epi16(m1);
-        m1hi = _mm_srli_si128(m1, 8);
-        m1hi = _mm_cvtepu8_epi16(m1hi);
+        m1hi = _mm_unpackhi_epi8(m1, zero);
 
         n1lo = _mm_cvtepu8_epi16(n1);
-        n1hi = _mm_srli_si128(n1, 8);
-        n1hi = _mm_cvtepu8_epi16(n1hi);
+        n1hi = _mm_unpackhi_epi8(n1, zero);
 
         diff = _mm_sub_epi16(m1lo, n1lo);
         sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
@@ -372,8 +365,8 @@
         Cur += strideCur;
     }
 
-    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
-    sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+    sum = _mm_hadd_epi32(sum, sum);
+    sum = _mm_hadd_epi32(sum, sum);
     return _mm_cvtsi128_si32(sum);
 }
 


More information about the x265-devel mailing list