[x265] [PATCH] pixel: simplify sad_12 to make it easier to maintain

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Mon Oct 7 09:51:03 CEST 2013


# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381132178 -19800
#      Mon Oct 07 13:19:38 2013 +0530
# Node ID 04e9387f12ad8710baf22ee88063893e208f2c41
# Parent  c010342f7605c86867824f5b525a8f84c0d2de1c
pixel: simplify sad_12 to make it easier to maintain

diff -r c010342f7605 -r 04e9387f12ad source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp	Sun Oct 06 02:09:00 2013 -0500
+++ b/source/common/vec/pixel-sse41.cpp	Mon Oct 07 13:19:38 2013 +0530
@@ -207,123 +207,42 @@
     assert(ly == 16);
     __m128i sum0 = _mm_setzero_si128();
     __m128i sum1 = _mm_setzero_si128();
-
-    __m128i mask = _mm_set_epi32(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff);
-
     __m128i T00, T01, T02, T03;
     __m128i T10, T11, T12, T13;
     __m128i T20, T21, T22, T23;
 
-    T00 = _mm_load_si128((__m128i*)(fenc));
-    T00 = _mm_and_si128(T00, mask);
-    T01 = _mm_load_si128((__m128i*)(fenc + fencstride));
-    T01 = _mm_and_si128(T01, mask);
-    T02 = _mm_load_si128((__m128i*)(fenc + (2) * fencstride));
-    T02 = _mm_and_si128(T02, mask);
-    T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));
-    T03 = _mm_and_si128(T03, mask);
-
-    T10 = _mm_loadu_si128((__m128i*)(fref));
-    T10 = _mm_and_si128(T10, mask);
-    T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));
-    T11 = _mm_and_si128(T11, mask);
-    T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));
-    T12 = _mm_and_si128(T12, mask);
-    T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));
-    T13 = _mm_and_si128(T13, mask);
-
-    T20 = _mm_sad_epu8(T00, T10);
-    T21 = _mm_sad_epu8(T01, T11);
-    T22 = _mm_sad_epu8(T02, T12);
-    T23 = _mm_sad_epu8(T03, T13);
-
-    T20 = _mm_add_epi16(T20, T21);
-    T22 = _mm_add_epi16(T22, T23);
-    sum0 = _mm_add_epi16(T20,T22);
-
-    T00 = _mm_load_si128((__m128i*)(fenc + (4) * fencstride));
-    T00 = _mm_and_si128(T00, mask);
-    T01 = _mm_load_si128((__m128i*)(fenc + (5) * fencstride));
-    T01 = _mm_and_si128(T01, mask);
-    T02 = _mm_load_si128((__m128i*)(fenc + (6) * fencstride));
-    T02 = _mm_and_si128(T02, mask);
-    T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));
-    T03 = _mm_and_si128(T03, mask);
-
-    T10 = _mm_loadu_si128((__m128i*)(fref + (4) * frefstride));
-    T10 = _mm_and_si128(T10, mask);
-    T11 = _mm_loadu_si128((__m128i*)(fref + (5) * frefstride));
-    T11 = _mm_and_si128(T11, mask);
-    T12 = _mm_loadu_si128((__m128i*)(fref + (6) * frefstride));
-    T12 = _mm_and_si128(T12, mask);
-    T13 = _mm_loadu_si128((__m128i*)(fref + (7) * frefstride));
-    T13 = _mm_and_si128(T13, mask);
-
-    T20 = _mm_sad_epu8(T00, T10);
-    T21 = _mm_sad_epu8(T01, T11);
-    T22 = _mm_sad_epu8(T02, T12);
-    T23 = _mm_sad_epu8(T03, T13);
-
-    sum0 = _mm_add_epi16(sum0, T20);
-    sum0 = _mm_add_epi16(sum0, T21);
-    sum0 = _mm_add_epi16(sum0, T22);
-    sum0 = _mm_add_epi16(sum0, T23);
-
-    T00 = _mm_load_si128((__m128i*)(fenc + (8) * fencstride));
-    T00 = _mm_and_si128(T00, mask);
-    T01 = _mm_load_si128((__m128i*)(fenc + (9) * fencstride));
-    T01 = _mm_and_si128(T01, mask);
-    T02 = _mm_load_si128((__m128i*)(fenc + (10) * fencstride));
-    T02 = _mm_and_si128(T02, mask);
-    T03 = _mm_load_si128((__m128i*)(fenc + (11) * fencstride));
-    T03 = _mm_and_si128(T03, mask);
-
-    T10 = _mm_loadu_si128((__m128i*)(fref + (8) * frefstride));
-    T10 = _mm_and_si128(T10, mask);
-    T11 = _mm_loadu_si128((__m128i*)(fref + (9) * frefstride));
-    T11 = _mm_and_si128(T11, mask);
-    T12 = _mm_loadu_si128((__m128i*)(fref + (10) * frefstride));
-    T12 = _mm_and_si128(T12, mask);
-    T13 = _mm_loadu_si128((__m128i*)(fref + (11) * frefstride));
-    T13 = _mm_and_si128(T13, mask);
-
-    T20 = _mm_sad_epu8(T00, T10);
-    T21 = _mm_sad_epu8(T01, T11);
-    T22 = _mm_sad_epu8(T02, T12);
-    T23 = _mm_sad_epu8(T03, T13);
-
-    sum0 = _mm_add_epi16(sum0, T20);
-    sum0 = _mm_add_epi16(sum0, T21);
-    sum0 = _mm_add_epi16(sum0, T22);
-    sum0 = _mm_add_epi16(sum0, T23);
-
-    T00 = _mm_load_si128((__m128i*)(fenc + (12) * fencstride));
-    T00 = _mm_and_si128(T00, mask);
-    T01 = _mm_load_si128((__m128i*)(fenc + (13) * fencstride));
-    T01 = _mm_and_si128(T01, mask);
-    T02 = _mm_load_si128((__m128i*)(fenc + (14) * fencstride));
-    T02 = _mm_and_si128(T02, mask);
-    T03 = _mm_load_si128((__m128i*)(fenc + (15) * fencstride));
-    T03 = _mm_and_si128(T03, mask);
-
-    T10 = _mm_loadu_si128((__m128i*)(fref + (12) * frefstride));
-    T10 = _mm_and_si128(T10, mask);
-    T11 = _mm_loadu_si128((__m128i*)(fref + (13) * frefstride));
-    T11 = _mm_and_si128(T11, mask);
-    T12 = _mm_loadu_si128((__m128i*)(fref + (14) * frefstride));
-    T12 = _mm_and_si128(T12, mask);
-    T13 = _mm_loadu_si128((__m128i*)(fref + (15) * frefstride));
-    T13 = _mm_and_si128(T13, mask);
-
-    T20 = _mm_sad_epu8(T00, T10);
-    T21 = _mm_sad_epu8(T01, T11);
-    T22 = _mm_sad_epu8(T02, T12);
-    T23 = _mm_sad_epu8(T03, T13);
-
-    sum0 = _mm_add_epi16(sum0, T20);
-    sum0 = _mm_add_epi16(sum0, T21);
-    sum0 = _mm_add_epi16(sum0, T22);
-    sum0 = _mm_add_epi16(sum0, T23);
+#define MASK _mm_set_epi32(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff)
+
+#define PROCESS_12x4(BASE)\
+    T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
+    T00 = _mm_and_si128(T00, MASK); \
+    T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
+    T01 = _mm_and_si128(T01, MASK); \
+    T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
+    T02 = _mm_and_si128(T02, MASK); \
+    T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
+    T03 = _mm_and_si128(T03, MASK); \
+    T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
+    T10 = _mm_and_si128(T10, MASK); \
+    T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
+    T11 = _mm_and_si128(T11, MASK); \
+    T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
+    T12 = _mm_and_si128(T12, MASK); \
+    T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
+    T13 = _mm_and_si128(T13, MASK); \
+    T20 = _mm_sad_epu8(T00, T10); \
+    T21 = _mm_sad_epu8(T01, T11); \
+    T22 = _mm_sad_epu8(T02, T12); \
+    T23 = _mm_sad_epu8(T03, T13); \
+    sum0 = _mm_add_epi16(sum0, T20); \
+    sum0 = _mm_add_epi16(sum0, T21); \
+    sum0 = _mm_add_epi16(sum0, T22); \
+    sum0 = _mm_add_epi16(sum0, T23)
+
+    PROCESS_12x4(0);
+    PROCESS_12x4(4);
+    PROCESS_12x4(8);
+    PROCESS_12x4(12);
 
     sum1 = _mm_shuffle_epi32(sum0, 2);
     sum0 = _mm_add_epi32(sum0, sum1);
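
A note on the arithmetic above: MASK zeroes the top four bytes of each 16-byte
load, so after the AND both the fenc and fref rows have pixels 12-15 forced to
zero and _mm_sad_epu8 only accumulates differences over the first 12 pixels of
each row. _mm_sad_epu8 leaves a 16-bit partial sum in the low word of each
64-bit lane, and since 16 rows x 12 pixels x 255 is well under 65536 the
_mm_add_epi16 accumulation cannot overflow; the _mm_shuffle_epi32 /
_mm_add_epi32 tail then folds the upper lane's partial sum into the lower one
so the total can be read from the low dword of sum0.

For reference, here is a plain-C sketch of what this 12x16 SAD primitive
computes (sad_12x16_c is a hypothetical name invented here for illustration,
not x265's actual C reference primitive):

    /* Illustrative scalar equivalent of the vectorized 12x16 SAD above. */
    static int sad_12x16_c(const unsigned char* fenc, int fencstride,
                           const unsigned char* fref, int frefstride)
    {
        int sum = 0;
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 12; x++)  // only 12 of the 16 loaded bytes count
            {
                int d = fenc[x] - fref[x];
                sum += (d < 0) ? -d : d;
            }
            fenc += fencstride;
            fref += frefstride;
        }
        return sum;
    }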

