[x265] [PATCH] pixel8.inc: Enabled MMX code for 32-bit builds except VC [added macro and swapped positions of SSE and MMX functions]

praveen at multicorewareinc.com
Mon Aug 26 08:32:47 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1377498758 -19800
# Node ID 1985c7f63a93166ce172a65682156c89626f8225
# Parent  797c13ec5d2ae7985027f59dcf4d4c5f86c1d367
pixel8.inc: Enabled MMX code for 32-bit builds except VC [added macro and swapped positions of SSE and MMX functions]

diff -r 797c13ec5d2a -r 1985c7f63a93 source/common/vec/pixel8.inc
--- a/source/common/vec/pixel8.inc	Sun Aug 25 21:53:55 2013 -0500
+++ b/source/common/vec/pixel8.inc	Mon Aug 26 12:02:38 2013 +0530
@@ -35,8 +35,183 @@
 #pragma warning(disable: 4799) // MMX warning EMMS
 #endif
 
+#define HAVE_MMX (!((_MSC_VER) && (X86_64)))
+
 #if INSTRSET >= X265_CPU_LEVEL_SSE41
-#ifdef X86_64
+#if HAVE_MMX
+template<int ly>
+int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
+{
+    assert((ly % 4) == 0);
+
+    __m64 sum0 = _mm_setzero_si64();
+
+    __m64 T00, T01, T02, T03;
+    __m64 T10, T11, T12, T13;
+    __m64 T20, T21, T22, T23;
+
+    if ((ly % 16) == 0)
+    {
+        for (int i = 0; i < ly; i += 16)
+        {
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 4) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 5) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 6) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 7) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 4) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 5) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 6) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 7) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 8) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 9) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 10) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 11) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 8) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 9) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 10) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 11) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 12) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 13) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 14) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 15) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 12) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 13) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 14) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 15) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    else if ((ly % 8) == 0)
+    {
+        for (int i = 0; i < ly; i += 8)
+        {
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 4) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 5) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 6) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 7) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 4) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 5) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 6) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 7) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < ly; i += 4)
+        {
+            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
+            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
+            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
+            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
+
+            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
+            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
+            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
+            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
+
+            T20 = _mm_sad_pu8(T00, T10);
+            T21 = _mm_sad_pu8(T01, T11);
+            T22 = _mm_sad_pu8(T02, T12);
+            T23 = _mm_sad_pu8(T03, T13);
+
+            sum0 = _mm_add_pi16(sum0, T20);
+            sum0 = _mm_add_pi16(sum0, T21);
+            sum0 = _mm_add_pi16(sum0, T22);
+            sum0 = _mm_add_pi16(sum0, T23);
+        }
+    }
+    // 8 * 255 -> 11 bits x 8 -> 14 bits
+    int sum = _m_to_int(sum0);
+    return sum;
+}
+
+#else /* ifdef X86_64 */
+
 template<int ly>
 int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
 {
@@ -259,179 +434,6 @@
     return _mm_extract_epi32(sum0, 0);
 }
 
-#else /* ifdef X86_64 */
-
-template<int ly>
-int sad_4(pixel * fenc, intptr_t fencstride, pixel * fref, intptr_t frefstride)
-{
-    assert((ly % 4) == 0);
-
-    __m64 sum0 = _mm_setzero_si64();
-
-    __m64 T00, T01, T02, T03;
-    __m64 T10, T11, T12, T13;
-    __m64 T20, T21, T22, T23;
-
-    if ((ly % 16) == 0)
-    {
-        for (int i = 0; i < ly; i += 16)
-        {
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 4) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 5) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 6) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 7) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 4) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 5) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 6) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 7) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 8) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 9) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 10) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 11) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 8) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 9) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 10) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 11) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 12) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 13) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 14) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 15) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 12) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 13) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 14) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 15) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    else if ((ly % 8) == 0)
-    {
-        for (int i = 0; i < ly; i += 8)
-        {
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 4) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 5) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 6) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 7) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 4) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 5) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 6) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 7) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    else
-    {
-        for (int i = 0; i < ly; i += 4)
-        {
-            T00 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 0) * fencstride));
-            T01 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 1) * fencstride));
-            T02 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 2) * fencstride));
-            T03 = _mm_cvtsi32_si64(*(int*)(fenc + (i + 3) * fencstride));
-
-            T10 = _mm_cvtsi32_si64(*(int*)(fref + (i + 0) * frefstride));
-            T11 = _mm_cvtsi32_si64(*(int*)(fref + (i + 1) * frefstride));
-            T12 = _mm_cvtsi32_si64(*(int*)(fref + (i + 2) * frefstride));
-            T13 = _mm_cvtsi32_si64(*(int*)(fref + (i + 3) * frefstride));
-
-            T20 = _mm_sad_pu8(T00, T10);
-            T21 = _mm_sad_pu8(T01, T11);
-            T22 = _mm_sad_pu8(T02, T12);
-            T23 = _mm_sad_pu8(T03, T13);
-
-            sum0 = _mm_add_pi16(sum0, T20);
-            sum0 = _mm_add_pi16(sum0, T21);
-            sum0 = _mm_add_pi16(sum0, T22);
-            sum0 = _mm_add_pi16(sum0, T23);
-        }
-    }
-    // 8 * 255 -> 11 bits x 8 -> 14 bits
-    int sum = _m_to_int(sum0);
-    return sum;
-}
-
 #endif /* ifdef X86_64 */
 
 #ifdef X86_64

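For reviewers unfamiliar with the MMX path being moved above: the new HAVE_MMX macro evaluates to 0 only when both _MSC_VER and X86_64 are defined (MSVC targeting x64, which does not support MMX intrinsics) and to 1 everywhere else, so the __m64 version of sad_4 is now compiled for all other toolchains. Below is a minimal, self-contained sketch of the technique that function uses: each 4-pixel row is loaded into the low 32 bits of an MMX register, _mm_sad_pu8 gives the per-row sum of absolute differences, and the results are accumulated in a single __m64. The function name sad4_mmx_sketch and the 4x4 test buffers are illustrative only and are not part of the patch; note it will not build with MSVC for x64, which is exactly the case the macro excludes.

    // Illustrative only: a simplified MMX SAD for a 4xN block of 8-bit pixels,
    // mirroring the pattern used by sad_4<ly> in pixel8.inc (not the patch itself).
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <mmintrin.h>   // core MMX intrinsics (__m64, _mm_add_pi16, ...)
    #include <xmmintrin.h>  // _mm_sad_pu8 (MMX extension introduced with SSE)

    static int sad4_mmx_sketch(const uint8_t* fenc, intptr_t fencstride,
                               const uint8_t* fref, intptr_t frefstride, int ly)
    {
        assert((ly % 4) == 0);
        __m64 sum = _mm_setzero_si64();

        for (int i = 0; i < ly; i++)
        {
            // Load one 4-byte row from each block into the low half of an MMX register.
            __m64 e = _mm_cvtsi32_si64(*(const int*)(fenc + i * fencstride));
            __m64 r = _mm_cvtsi32_si64(*(const int*)(fref + i * frefstride));

            // psadbw: sum of absolute byte differences; the row total lands in the low 16 bits.
            sum = _mm_add_pi16(sum, _mm_sad_pu8(e, r));
        }

        int result = _m_to_int(sum);
        _mm_empty();  // clear MMX state before any x87 floating-point code runs (EMMS)
        return result;
    }

    int main()
    {
        uint8_t enc[4 * 4] = { 10, 20, 30, 40,   50, 60, 70, 80,
                               90, 100, 110, 120, 130, 140, 150, 160 };
        uint8_t ref[4 * 4] = { 12, 18, 33, 44,   50, 60, 70, 80,
                               90, 100, 110, 120, 130, 140, 150, 160 };
        // Only the first row differs: |10-12|+|20-18|+|30-33|+|40-44| = 11
        printf("SAD = %d\n", sad4_mmx_sketch(enc, 4, ref, 4, 4));
        return 0;
    }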
