[x265] [PATCH] pixel8.inc: sad_x4_8 further optimization

praveen at multicorewareinc.com
Mon Aug 26 14:52:43 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1377521554 -19800
# Node ID 51c03799e05b23e3f8c3eea3cbab8795f874bbca
# Parent  eebe18eed69aef83c4418e943257ace30afe62f5
pixel8.inc: sad_x4_8 further optimization
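
Keep the four partial SAD sums in vector registers (sum0..sum3) for the whole
function and hoist the scalar extraction out of the per-block code: each block
now folds its psadbw result into its accumulator, and the four
_mm_cvtsi128_si32 extractions into res[0..3] happen exactly once at the end.
The res[] zero-initialization in the loop-based paths becomes redundant and is
dropped.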

diff -r eebe18eed69a -r 51c03799e05b source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Mon Aug 26 17:13:51 2013 +0530
+++ b/source/common/vec/pixel8.inc	Mon Aug 26 18:22:34 2013 +0530
@@ -6245,6 +6245,9 @@
 {
     assert((ly % 4) == 0);
     __m128i sum0 = _mm_setzero_si128();
+    __m128i sum1 = _mm_setzero_si128();
+    __m128i sum2 = _mm_setzero_si128();
+    __m128i sum3 = _mm_setzero_si128();
 
     __m128i T00, T01, T02, T03;
     __m128i T10, T11, T12, T13;
@@ -6269,9 +6272,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = _mm_cvtsi128_si32(sum0);
+        sum0 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
@@ -6283,9 +6284,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = _mm_cvtsi128_si32(sum0);
+        sum1 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
@@ -6297,9 +6296,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = _mm_cvtsi128_si32(sum0);
+        sum2 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (1) * frefstride));
@@ -6311,9 +6308,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = _mm_cvtsi128_si32(sum0);
+        sum3 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
     }
     else if (ly == 8)
     {
@@ -6334,9 +6329,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = _mm_cvtsi128_si32(sum0);
+        sum0 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
@@ -6348,9 +6341,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = _mm_cvtsi128_si32(sum0);
+        sum1 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
@@ -6362,9 +6353,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = _mm_cvtsi128_si32(sum0);
+        sum2 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (1) * frefstride));
@@ -6376,9 +6365,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = _mm_cvtsi128_si32(sum0);
+        sum3 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
@@ -6397,9 +6384,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
         sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
@@ -6411,9 +6397,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum1 = _mm_add_epi32(sum1, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
@@ -6425,9 +6410,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum2 = _mm_add_epi32(sum2, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
@@ -6439,9 +6423,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum3 = _mm_add_epi32(sum3, T21);
     }
     else if (ly == 16)
     {
@@ -6462,9 +6445,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = _mm_cvtsi128_si32(sum0);
+        sum0 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (1) * frefstride));
@@ -6476,9 +6457,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = _mm_cvtsi128_si32(sum0);
+        sum1 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (1) * frefstride));
@@ -6490,9 +6469,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = _mm_cvtsi128_si32(sum0);
+        sum2 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (0) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (1) * frefstride));
@@ -6504,9 +6481,7 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = _mm_cvtsi128_si32(sum0);
+        sum3 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
@@ -6525,9 +6500,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
         sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (5) * frefstride));
@@ -6539,9 +6513,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum1 = _mm_add_epi32(sum1, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (5) * frefstride));
@@ -6553,9 +6526,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum2 = _mm_add_epi32(sum2, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (4) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (5) * frefstride));
@@ -6567,9 +6539,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum3 = _mm_add_epi32(sum3, T21);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
@@ -6588,9 +6559,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
         sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (8) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (9) * frefstride));
@@ -6602,9 +6572,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum1 = _mm_add_epi32(sum1, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (8) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (9) * frefstride));
@@ -6616,9 +6585,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum2 = _mm_add_epi32(sum2, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (8) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (9) * frefstride));
@@ -6630,9 +6598,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum3 = _mm_add_epi32(sum3, T21);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
@@ -6651,9 +6618,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
         sum0 = _mm_add_epi32(sum0, T21);
-        res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref2 + (12) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref2 + (13) * frefstride));
@@ -6665,9 +6631,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum1 = _mm_add_epi32(sum1, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref3 + (12) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref3 + (13) * frefstride));
@@ -6679,9 +6644,8 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum2 = _mm_add_epi32(sum2, T21);
 
         T10 = _mm_loadl_epi64((__m128i*)(fref4 + (12) * frefstride));
         T11 = _mm_loadl_epi64((__m128i*)(fref4 + (13) * frefstride));
@@ -6693,13 +6657,11 @@
         T20 = _mm_sad_epu8(T01, T11);
         T21 = _mm_sad_epu8(T03, T13);
         T21 = _mm_add_epi32(T20, T21);
-        sum0 = _mm_shuffle_epi32(T21, 2);
-        sum0 = _mm_add_epi32(sum0, T21);
-        res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+        T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+        sum3 = _mm_add_epi32(sum3, T21);
     }
     else if ((ly % 8) == 0)
     {
-        res[0] = res[1] = res[2] = res[3] = 0;
         for (int i = 0; i < ly; i += 8)
         {
             T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
@@ -6719,9 +6681,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
             sum0 = _mm_add_epi32(sum0, T21);
-            res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
@@ -6733,9 +6694,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum1 = _mm_add_epi32(sum1, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
@@ -6747,9 +6707,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum2 = _mm_add_epi32(sum2, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 1) * frefstride));
@@ -6761,9 +6720,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum3 = _mm_add_epi32(sum3, T21);
 
             T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
             T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
@@ -6782,9 +6740,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
             sum0 = _mm_add_epi32(sum0, T21);
-            res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 4) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 5) * frefstride));
@@ -6796,9 +6753,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum1 = _mm_add_epi32(sum1, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 4) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 5) * frefstride));
@@ -6810,9 +6766,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum2 = _mm_add_epi32(sum2, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 4) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 5) * frefstride));
@@ -6824,14 +6779,12 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum3 = _mm_add_epi32(sum3, T21);
         }
     }
     else
     {
-        res[0] = res[1] = res[2] = res[3] = 0;
         for (int i = 0; i < ly; i += 4)
         {
             T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 0) * FENC_STRIDE));
@@ -6851,9 +6804,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
             sum0 = _mm_add_epi32(sum0, T21);
-            res[0] = res[0] + _mm_cvtsi128_si32(sum0);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref2 + (i + 1) * frefstride));
@@ -6865,9 +6817,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[1] = res[1] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum1 = _mm_add_epi32(sum1, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref3 + (i + 1) * frefstride));
@@ -6879,9 +6830,8 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[2] = res[2] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum2 = _mm_add_epi32(sum2, T21);
 
             T10 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 0) * frefstride));
             T11 = _mm_loadl_epi64((__m128i*)(fref4 + (i + 1) * frefstride));
@@ -6893,11 +6843,15 @@
             T20 = _mm_sad_epu8(T01, T11);
             T21 = _mm_sad_epu8(T03, T13);
             T21 = _mm_add_epi32(T20, T21);
-            sum0 = _mm_shuffle_epi32(T21, 2);
-            sum0 = _mm_add_epi32(sum0, T21);
-            res[3] = res[3] + _mm_cvtsi128_si32(sum0);
+            T21 = _mm_add_epi32(T21, _mm_shuffle_epi32(T21, 2));
+            sum3 = _mm_add_epi32(sum3, T21);
         }
     }
+
+    res[0] = _mm_cvtsi128_si32(sum0);
+    res[1] = _mm_cvtsi128_si32(sum1);
+    res[2] = _mm_cvtsi128_si32(sum2);
+    res[3] = _mm_cvtsi128_si32(sum3);
 }
 
 #endif /* if HAVE_MMX */
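
Not part of the patch: a minimal standalone sketch of the register-accumulation
pattern applied above. The block shape, the helper name sad_8xN, and the
two-rows-per-register packing via _mm_unpacklo_epi64 are illustrative
assumptions; only the reduction idiom itself (accumulate psadbw results in an
XMM register, fold and extract once after the loop) is taken from the change.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative only, not from the x265 source: SAD of an 8-wide,
 * rows-tall block, two rows per XMM register, with the horizontal
 * reduction hoisted out of the loop the same way this patch does
 * for sad_x4_8. */
static int sad_8xN(const uint8_t *pix1, ptrdiff_t stride1,
                   const uint8_t *pix2, ptrdiff_t stride2, int rows)
{
    __m128i sum = _mm_setzero_si128();
    for (int y = 0; y < rows; y += 2)
    {
        __m128i a = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i*)(pix1 + (y + 0) * stride1)),
            _mm_loadl_epi64((const __m128i*)(pix1 + (y + 1) * stride1)));
        __m128i b = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const __m128i*)(pix2 + (y + 0) * stride2)),
            _mm_loadl_epi64((const __m128i*)(pix2 + (y + 1) * stride2)));
        /* psadbw leaves the two row SADs in 32-bit elements 0 and 2;
         * keep accumulating in the register instead of folding and
         * extracting inside the loop. */
        sum = _mm_add_epi32(sum, _mm_sad_epu8(a, b));
    }
    /* one fold (element 2 into element 0) and one scalar extract */
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 2));
    return _mm_cvtsi128_si32(sum);
}

int main(void)
{
    uint8_t enc[8 * 16], ref[8 * 16];
    int expected = 0;
    for (int i = 0; i < 8 * 16; i++)
    {
        enc[i] = (uint8_t)(rand() & 0xFF);
        ref[i] = (uint8_t)(rand() & 0xFF);
        expected += abs((int)enc[i] - (int)ref[i]);
    }
    printf("sad=%d expected=%d\n", sad_8xN(enc, 8, ref, 8, 16), expected);
    return 0;
}

The point of deferring _mm_cvtsi128_si32 is that each extraction is a
vector-to-scalar register transfer: the rewrite removes that transfer and the
scalar res[] read-modify-write from every block, keeping the whole accumulation
in XMM registers until function exit.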

