[x265] [PATCH] pixel: simplify sad_24 to make it easier to maintain
dnyaneshwar at multicorewareinc.com
Mon Oct 7 10:25:48 CEST 2013
# HG changeset patch
# User Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
# Date 1381134290 -19800
# Mon Oct 07 13:54:50 2013 +0530
# Node ID 8f589218eb2a11858c6bd63f37f0c1bc4f7a429c
# Parent 04e9387f12ad8710baf22ee88063893e208f2c41
pixel: simplify sad_24 to make it easier to maintain
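
For readers skimming the diff: the PROCESS_24x4 macro introduced below computes the usual sum of absolute differences over a 24-pixel-wide block, four rows at a time. A minimal scalar sketch of the same semantics (illustrative only, not part of the patch; assumes 8-bit pixels, matching x265's 8bpp pixel typedef):

    #include <stdint.h>
    #include <stdlib.h>

    static int sad_24_ref(const uint8_t* fenc, intptr_t fencstride,
                          const uint8_t* fref, intptr_t frefstride, int ly)
    {
        int sum = 0;
        for (int y = 0; y < ly; y++)        // ly rows
            for (int x = 0; x < 24; x++)    // 24 pixels per row
                sum += abs(fenc[y * fencstride + x] - fref[y * frefstride + x]);
        return sum;
    }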
diff -r 04e9387f12ad -r 8f589218eb2a source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp Mon Oct 07 13:19:38 2013 +0530
+++ b/source/common/vec/pixel-sse41.cpp Mon Oct 07 13:54:50 2013 +0530
@@ -310,102 +310,50 @@
{
__m128i sum0 = _mm_setzero_si128();
__m128i sum1 = _mm_setzero_si128();
+ __m128i T00, T01, T02, T03;
+ __m128i T10, T11, T12, T13;
+ __m128i T20, T21, T22, T23;
+
+#define PROCESS_24x4(BASE) \
+ T00 = _mm_load_si128((__m128i*)(fenc + (BASE + 0) * fencstride)); \
+ T01 = _mm_load_si128((__m128i*)(fenc + (BASE + 1) * fencstride)); \
+ T02 = _mm_load_si128((__m128i*)(fenc + (BASE + 2) * fencstride)); \
+ T03 = _mm_load_si128((__m128i*)(fenc + (BASE + 3) * fencstride)); \
+ T10 = _mm_loadu_si128((__m128i*)(fref + (BASE + 0) * frefstride)); \
+ T11 = _mm_loadu_si128((__m128i*)(fref + (BASE + 1) * frefstride)); \
+ T12 = _mm_loadu_si128((__m128i*)(fref + (BASE + 2) * frefstride)); \
+ T13 = _mm_loadu_si128((__m128i*)(fref + (BASE + 3) * frefstride)); \
+ T20 = _mm_sad_epu8(T00, T10); \
+ T21 = _mm_sad_epu8(T01, T11); \
+ T22 = _mm_sad_epu8(T02, T12); \
+ T23 = _mm_sad_epu8(T03, T13); \
+ sum0 = _mm_add_epi32(sum0, T20); \
+ sum0 = _mm_add_epi32(sum0, T21); \
+ sum0 = _mm_add_epi32(sum0, T22); \
+ sum0 = _mm_add_epi32(sum0, T23); \
+ T00 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 0) * fencstride))); \
+ T01 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 1) * fencstride))); \
+ T01 = _mm_unpacklo_epi64(T00, T01); \
+ T02 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 2) * fencstride))); \
+ T03 = _mm_loadl_epi64((__m128i*)(fenc + 16 + ((BASE + 3) * fencstride))); \
+ T03 = _mm_unpacklo_epi64(T02, T03); \
+ T10 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 0) * frefstride))); \
+ T11 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 1) * frefstride))); \
+ T11 = _mm_unpacklo_epi64(T10, T11); \
+ T12 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 2) * frefstride))); \
+ T13 = _mm_loadl_epi64((__m128i*)(fref + 16 + ((BASE + 3) * frefstride))); \
+ T13 = _mm_unpacklo_epi64(T12, T13); \
+ T20 = _mm_setzero_si128(); \
+ T21 = _mm_setzero_si128(); \
+ T20 = _mm_sad_epu8(T01, T11); \
+ T21 = _mm_sad_epu8(T03, T13); \
+ sum0 = _mm_add_epi32(sum0, T20); \
+ sum0 = _mm_add_epi32(sum0, T21);
for (int i = 0; i < ly; i += 8)
{
- __m128i T00, T01, T02, T03;
- __m128i T10, T11, T12, T13;
- __m128i T20, T21, T22, T23;
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + (i + 1) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + (i + 2) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + ((i) * fencstride) + 16));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 1) * fencstride) + 16));
- T01 = _mm_unpacklo_epi64(T00, T01);
-
- T02 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 2) * fencstride) + 16));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 3) * fencstride) + 16));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref + ((i) * frefstride) + 16));
- T11 = _mm_loadl_epi64((__m128i*)(fref + ((i + 1) * frefstride) + 16));
- T11 = _mm_unpacklo_epi64(T10, T11);
-
- T12 = _mm_loadl_epi64((__m128i*)(fref + ((i + 2) * frefstride) + 16));
- T13 = _mm_loadl_epi64((__m128i*)(fref + ((i + 3) * frefstride) + 16));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_setzero_si128();
- T21 = _mm_setzero_si128();
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
-
- T00 = _mm_load_si128((__m128i*)(fenc + (i + 4) * fencstride));
- T01 = _mm_load_si128((__m128i*)(fenc + (i + 5) * fencstride));
- T02 = _mm_load_si128((__m128i*)(fenc + (i + 6) * fencstride));
- T03 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
-
- T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
- T11 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
- T12 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
- T13 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
-
- T20 = _mm_sad_epu8(T00, T10);
- T21 = _mm_sad_epu8(T01, T11);
- T22 = _mm_sad_epu8(T02, T12);
- T23 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
- sum0 = _mm_add_epi32(sum0, T22);
- sum0 = _mm_add_epi32(sum0, T23);
-
- T00 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 4) * fencstride) + 16));
- T01 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 5) * fencstride) + 16));
- T01 = _mm_unpacklo_epi64(T00, T01);
-
- T02 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 6) * fencstride) + 16));
- T03 = _mm_loadl_epi64((__m128i*)(fenc + ((i + 7) * fencstride) + 16));
- T03 = _mm_unpacklo_epi64(T02, T03);
-
- T10 = _mm_loadl_epi64((__m128i*)(fref + ((i + 4) * frefstride) + 16));
- T11 = _mm_loadl_epi64((__m128i*)(fref + ((i + 5) * frefstride) + 16));
- T11 = _mm_unpacklo_epi64(T10, T11);
-
- T12 = _mm_loadl_epi64((__m128i*)(fref + ((i + 6) * frefstride) + 16));
- T13 = _mm_loadl_epi64((__m128i*)(fref + ((i + 7) * frefstride) + 16));
- T13 = _mm_unpacklo_epi64(T12, T13);
-
- T20 = _mm_setzero_si128();
- T21 = _mm_setzero_si128();
-
- T20 = _mm_sad_epu8(T01, T11);
- T21 = _mm_sad_epu8(T03, T13);
-
- sum0 = _mm_add_epi32(sum0, T20);
- sum0 = _mm_add_epi32(sum0, T21);
+ PROCESS_24x4(i);
+ PROCESS_24x4(i+4);
}
sum1 = _mm_shuffle_epi32(sum0, 2);
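
Note on the macro: each 24-pixel row is handled as an aligned 16-byte load plus an 8-byte remainder, and two rows' remainders are packed into one register with _mm_unpacklo_epi64 so a single _mm_sad_epu8 covers both. _mm_sad_epu8 leaves two 64-bit partial sums in the low and high halves of sum0, and the _mm_shuffle_epi32 in the last context line above begins the final fold. A sketch of how such a reduction typically completes (the remaining lines are not shown in this hunk; assumes the function returns a plain int):

    sum1 = _mm_shuffle_epi32(sum0, 2);  // move the upper partial sum to lane 0
    sum0 = _mm_add_epi32(sum0, sum1);   // combine low and high partial sums
    return _mm_cvtsi128_si32(sum0);     // extract the final SAD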