[x265] [PATCH] pixel8.inc: enable MMX version of sad_8 for 32-bit builds, except VC

praveen at multicorewareinc.com
Mon Aug 26 08:47:08 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1377499616 -19800
# Node ID d6b8b308aec28b43a9ecc530c6f5763bc374ad62
# Parent  56cfd7be4b0ebd26a70c1737af3aa23032e00056
pixel8.inc: enable MMX version of sad_8 for 32-bit builds, except VC

Move the MMX implementation of sad_8 from the !X86_64 branch to the
HAVE_MMX guard, so 32-bit builds other than VC pick up the MMX code
while all other builds keep the existing SSE implementation as the
fallback.

diff -r 56cfd7be4b0e -r d6b8b308aec2 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Mon Aug 26 12:07:43 2013 +0530
+++ b/source/common/vec/pixel8.inc	Mon Aug 26 12:16:56 2013 +0530
@@ -436,7 +436,180 @@
 
 #endif /* if HAVE_MMX */
 
-#ifdef X86_64
+#if HAVE_MMX
+template<int ly>
+int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    assert((ly % 4) == 0);
+
+    __m64 sum0 = _mm_setzero_si64();
+
+    __m64 T00, T01, T02, T03;
+    __m64 T10, T11, T12, T13;
+    __m64 T20, T21, T22, T23;
+
+    if ((ly % 16) == 0)
+    {
+        for (int i = 0; i < ly; i += 16)
+        {
+            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 4) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 5) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 6) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 7) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = (*(__m64*)(fenc + (i + 8) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 9) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 10) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 11) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 8) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 9) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 10) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 11) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = (*(__m64*)(fenc + (i + 12) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 13) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 14) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 15) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 12) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 13) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 14) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 15) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    else if ((ly % 8) == 0)
+    {
+        for (int i = 0; i < ly; i += 8)
+        {
+            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 4) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 5) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 6) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 7) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < ly; i += 4)
+        {
+            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
+            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
+            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
+            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
+
+            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
+            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
+            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
+            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    // 8 * 255 -> 11 bits x 8 -> 14 bits
+    int sum = _m_to_int(sum0);
+    return sum;
+}
+
+#else /* if HAVE_MMX */
+
 template<int ly>
 int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
 {
@@ -660,180 +833,7 @@
     return _mm_cvtsi128_si32(sum0);
 }
 
-#else /* ifdef X86_64 */
-
-template<int ly>
-int sad_8(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    assert((ly % 4) == 0);
-
-    __m64 sum0 = _mm_setzero_si64();
-
-    __m64 T00, T01, T02, T03;
-    __m64 T10, T11, T12, T13;
-    __m64 T20, T21, T22, T23;
-
-    if ((ly % 16) == 0)
-    {
-        for (int i = 0; i < ly; i += 16)
-        {
-            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 4) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 5) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 6) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 7) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = (*(__m64*)(fenc + (i + 8) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 9) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 10) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 11) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 8) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 9) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 10) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 11) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = (*(__m64*)(fenc + (i + 12) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 13) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 14) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 15) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 12) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 13) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 14) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 15) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    else if ((ly % 8) == 0)
-    {
-        for (int i = 0; i < ly; i += 8)
-        {
-            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = (*(__m64*)(fenc + (i + 4) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 5) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 6) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 7) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 4) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 5) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 6) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 7) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < ly; i += 4)
-        {
-            T00 = (*(__m64*)(fenc + (i + 0) * fencstride));
-            T01 = (*(__m64*)(fenc + (i + 1) * fencstride));
-            T02 = (*(__m64*)(fenc + (i + 2) * fencstride));
-            T03 = (*(__m64*)(fenc + (i + 3) * fencstride));
-
-            T10 = (*(__m64*)(fref + (i + 0) * frefstride));
-            T11 = (*(__m64*)(fref + (i + 1) * frefstride));
-            T12 = (*(__m64*)(fref + (i + 2) * frefstride));
-            T13 = (*(__m64*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    // 8 * 255 -> 11 bits x 8 -> 14 bits
-    int sum = _m_to_int(sum0);
-    return sum;
-}
-
-#endif /* ifdef X86_64 */
+#endif /* if HAVE_MMX */
 #endif /* SSE41 */
 
 template<int ly>

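For readers who want to try the kernel outside the encoder: below is a
minimal standalone sketch of the same MMX SAD pattern the patch enables,
assuming a plain uint8_t pixel type and GCC/Clang with MMX/SSE intrinsics
available. The sad_8x4_mmx name, the explicit _mm_empty() call, and the
main() test values are illustrative only and are not part of the patch.

#include <cstdint>
#include <cstdio>
#include <mmintrin.h>  // MMX: _mm_setzero_si64, _mm_add_pi16, _m_to_int, _mm_empty
#include <xmmintrin.h> // SSE: _mm_sad_pu8 on __m64 registers

typedef uint8_t pixel;

// SAD of an 8x4 block, one 8-byte row per MMX load, as in the patched sad_8<4>.
static int sad_8x4_mmx(const pixel* fenc, intptr_t fencstride,
                       const pixel* fref, intptr_t frefstride)
{
    __m64 sum = _mm_setzero_si64();
    for (int row = 0; row < 4; row++)
    {
        __m64 a = *(const __m64*)(fenc + row * fencstride);
        __m64 b = *(const __m64*)(fref + row * frefstride);
        // _mm_sad_pu8 sums |a[i] - b[i]| over the 8 bytes into the low 16 bits
        sum = _mm_add_pi16(sum, _mm_sad_pu8(a, b));
    }
    int result = _m_to_int(sum); // low 32 bits hold the accumulated SAD
    _mm_empty();                 // clear MMX state before any x87 FP code runs
    return result;
}

int main()
{
    pixel enc[4 * 8], ref[4 * 8];
    for (int i = 0; i < 32; i++) { enc[i] = (pixel)i; ref[i] = (pixel)(i + 2); }
    printf("SAD = %d\n", sad_8x4_mmx(enc, 8, ref, 8)); // 32 pixels, |diff| = 2 each -> 64
    return 0;
}

Each _mm_sad_pu8 result has its upper 48 bits zero, so accumulating with
_mm_add_pi16 cannot overflow for the block heights used here (the
"8 * 255 -> 11 bits" comment in the patch makes the same point).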
