[x265] [PATCH] pixel8.inc: further optimization of sad_x3_4

praveen at multicorewareinc.com praveen at multicorewareinc.com
Sun Aug 25 11:26:58 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1377422806 -19800
# Node ID 3a714ad926c0cfaa21bd158fa98562fc9a204b7a
# Parent  6cd20423923d12528405dd19f66dff0b19154f62
pixel8.inc: further optimization of sad_x3_4

diff -r 6cd20423923d -r 3a714ad926c0 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Sun Aug 25 14:28:00 2013 +0530
+++ b/source/common/vec/pixel8.inc	Sun Aug 25 14:56:46 2013 +0530
@@ -1365,20 +1365,20 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = _mm_extract_epi32(sum0, 0);
+        res[0] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = _mm_extract_epi32(sum0, 0);
+        res[1] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = _mm_extract_epi32(sum0, 0);
+        res[2] = _mm_cvtsi128_si32(sum0, 0);
     }
 
-    if (ly == 8)
+    else if (ly == 8)
     {
         res[0] = res[1] = res[2] = 0;
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
@@ -1416,17 +1416,17 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = _mm_extract_epi32(sum0, 0);
+        res[0] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = _mm_extract_epi32(sum0, 0);
+        res[1] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = _mm_extract_epi32(sum0, 0);
+        res[2] = _mm_cvtsi128_si32(sum0, 0);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
@@ -1463,20 +1463,20 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+        res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+        res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+        res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
     }
 
-    if (ly == 16)
+    else if (ly == 16)
     {
         res[0] = res[1] = res[2] = 0;
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (0) * FENC_STRIDE));
@@ -1514,17 +1514,17 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = _mm_extract_epi32(sum0, 0);
+        res[0] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = _mm_extract_epi32(sum0, 0);
+        res[1] = _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = _mm_extract_epi32(sum0, 0);
+        res[2] = _mm_cvtsi128_si32(sum0, 0);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (4) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (5) * FENC_STRIDE));
@@ -1561,17 +1561,17 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+        res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+        res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+        res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (8) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (9) * FENC_STRIDE));
@@ -1608,17 +1608,17 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+        res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+        res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+        res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
 
         T00 = _mm_loadl_epi64((__m128i*)(fenc + (12) * FENC_STRIDE));
         T01 = _mm_loadl_epi64((__m128i*)(fenc + (13) * FENC_STRIDE));
@@ -1655,17 +1655,17 @@
         T20 = _mm_sad_epu8(R00, R01);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+        res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R02);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+        res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
         T20 = _mm_sad_epu8(R00, R03);
         sum0 = _mm_shuffle_epi32(T20, 2);
         sum0 = _mm_add_epi32(sum0, T20);
-        res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+        res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
     }
     else if ((ly % 8) == 0)
     {
@@ -1707,17 +1707,17 @@
             T20 = _mm_sad_epu8(R00, R01);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+            res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R02);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+            res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R03);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+            res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
 
             T00 = _mm_loadl_epi64((__m128i*)(fenc + (i + 4) * FENC_STRIDE));
             T01 = _mm_loadl_epi64((__m128i*)(fenc + (i + 5) * FENC_STRIDE));
@@ -1754,17 +1754,17 @@
             T20 = _mm_sad_epu8(R00, R01);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+            res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R02);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+            res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R03);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+            res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
         }
     }
     else
@@ -1807,17 +1807,17 @@
             T20 = _mm_sad_epu8(R00, R01);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[0] = res[0] + _mm_extract_epi32(sum0, 0);
+            res[0] = res[0] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R02);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[1] = res[1] + _mm_extract_epi32(sum0, 0);
+            res[1] = res[1] + _mm_cvtsi128_si32(sum0, 0);
 
             T20 = _mm_sad_epu8(R00, R03);
             sum0 = _mm_shuffle_epi32(T20, 2);
             sum0 = _mm_add_epi32(sum0, T20);
-            res[2] = res[2] + _mm_extract_epi32(sum0, 0);
+            res[2] = res[2] + _mm_cvtsi128_si32(sum0, 0);
         }
     }
 }


More information about the x265-devel mailing list