[x265-commits] [x265] pixel: use unaligned loads for reference pixels in sad_12
Steve Borho
steve at borho.org
Sat Oct 5 23:27:30 CEST 2013
details: http://hg.videolan.org/x265/rev/49231db18e60
branches:
changeset: 4218:49231db18e60
user: Steve Borho <steve at borho.org>
date: Sat Oct 05 16:25:21 2013 -0500
description:
pixel: use unaligned loads for reference pixels in sad_12
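This is a correctness fix: _mm_load_si128 requires a 16-byte-aligned address and can fault otherwise, while _mm_loadu_si128 accepts any address. The encoded block (fenc) comes from an encoder-allocated buffer and keeps its aligned loads below, but the reference pointer (fref) is offset by arbitrary motion-vector candidates during the search, so its alignment cannot be assumed. A minimal sketch of the access pattern (illustration only, not part of the patch; sad16_row is a hypothetical helper):

    #include <emmintrin.h>   // SSE2 intrinsics
    #include <stdint.h>

    // SAD of one 16-pixel row. fenc comes from an aligned buffer, so the
    // aligned load is safe; fref is offset by a motion-vector candidate
    // and may sit at any byte address, so it needs the unaligned load.
    int sad16_row(const uint8_t *fenc, const uint8_t *fref)
    {
        __m128i enc = _mm_load_si128((const __m128i*)fenc);   // aligned load
        __m128i ref = _mm_loadu_si128((const __m128i*)fref);  // unaligned load
        __m128i sad = _mm_sad_epu8(enc, ref);                 // partial sums in words 0 and 4
        return _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);
    }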
diffstat:
source/common/vec/pixel-sse41.cpp | 122 +++++++++++++++++++-------------------
1 files changed, 61 insertions(+), 61 deletions(-)
diffs (204 lines):
diff -r fbfa3a5c5ae8 -r 49231db18e60 source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp Sat Oct 05 15:04:28 2013 -0500
+++ b/source/common/vec/pixel-sse41.cpp Sat Oct 05 16:25:21 2013 -0500
@@ -865,13 +865,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -898,13 +898,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -925,13 +925,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (4) * frefstride));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + (5) * frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (6) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (7) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -959,13 +959,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (3) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (2) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (3) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (2) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (3) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -986,13 +986,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (7) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (4) * frefstride));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (5) * frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (6) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (7) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (4) * frefstride));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + (5) * frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (6) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (7) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -1014,13 +1014,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (11) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (8) * frefstride));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (9) * frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (10) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (11) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (8) * frefstride));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + (9) * frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (10) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (11) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -1042,13 +1042,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (15) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (12) * frefstride));
- T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (13) * frefstride));
- T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (14) * frefstride));
- T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (15) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (12) * frefstride));
+ T10 = _mm_and_si128(T10, mask);
+ T11 = _mm_loadu_si128((__m128i*)(fref + (13) * frefstride));
+ T11 = _mm_and_si128(T11, mask);
+ T12 = _mm_loadu_si128((__m128i*)(fref + (14) * frefstride));
+ T12 = _mm_and_si128(T12, mask);
+ T13 = _mm_loadu_si128((__m128i*)(fref + (15) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -1078,13 +1078,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (i) * frefstride));
T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));
+ T11 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));
+ T12 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));
+ T13 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -1106,13 +1106,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (i + 7) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (i + 4) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (i + 4) * frefstride));
T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (i + 5) * frefstride));
+ T11 = _mm_loadu_si128((__m128i*)(fref + (i + 5) * frefstride));
T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (i + 6) * frefstride));
+ T12 = _mm_loadu_si128((__m128i*)(fref + (i + 6) * frefstride));
T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (i + 7) * frefstride));
+ T13 = _mm_loadu_si128((__m128i*)(fref + (i + 7) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
@@ -1143,13 +1143,13 @@ int sad_12(pixel *fenc, intptr_t fencstr
T03 = _mm_load_si128((__m128i*)(fenc + (i + 3) * fencstride));
T03 = _mm_and_si128(T03, mask);
- T10 = _mm_load_si128((__m128i*)(fref + (i) * frefstride));
+ T10 = _mm_loadu_si128((__m128i*)(fref + (i) * frefstride));
T10 = _mm_and_si128(T10, mask);
- T11 = _mm_load_si128((__m128i*)(fref + (i + 1) * frefstride));
+ T11 = _mm_loadu_si128((__m128i*)(fref + (i + 1) * frefstride));
T11 = _mm_and_si128(T11, mask);
- T12 = _mm_load_si128((__m128i*)(fref + (i + 2) * frefstride));
+ T12 = _mm_loadu_si128((__m128i*)(fref + (i + 2) * frefstride));
T12 = _mm_and_si128(T12, mask);
- T13 = _mm_load_si128((__m128i*)(fref + (i + 3) * frefstride));
+ T13 = _mm_loadu_si128((__m128i*)(fref + (i + 3) * frefstride));
T13 = _mm_and_si128(T13, mask);
T20 = _mm_sad_epu8(T00, T10);
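For reference, every load above is followed by _mm_and_si128(..., mask) because sad_12 measures a 12-pixel-wide block with 16-byte vectors: the top four bytes of each row are zeroed in both the encoded and reference registers, so those lanes contribute |0 - 0| = 0 to _mm_sad_epu8 and the result is a true 12-wide SAD. The mask setup itself lies outside the hunks shown; a plausible construction (a sketch, not the file's actual code):

    #include <emmintrin.h>

    // Keep bytes 0..11, zero bytes 12..15. Applied to both the fenc and
    // fref rows, the zeroed lanes cancel out of the SAD.
    static inline __m128i mask12()
    {
        return _mm_setr_epi32(-1, -1, -1, 0);
    }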