[x265] [PATCH] pixel: modified sse_sp8 with a common macro SSE_SP8x1
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Thu Oct 10 08:28:52 CEST 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381386434 -19800
# Thu Oct 10 11:57:14 2013 +0530
# Node ID a5a07d890f4e01841fc99354c1231f42611a5ea0
# Parent edcc92f2b2abb88c03a9751334fc9a427b8d52f0
pixel: modified sse_sp8 with a common macro SSE_SP8x1
diff -r edcc92f2b2ab -r a5a07d890f4e source/common/vec/sse.inc
--- a/source/common/vec/sse.inc Wed Oct 09 20:14:29 2013 -0500
+++ b/source/common/vec/sse.inc Thu Oct 10 11:57:14 2013 +0530
@@ -283,6 +283,19 @@
return _mm_cvtsi128_si32(sum);
}
+
+#define SSE_SP8x1 \
+ T10 = _mm_unpacklo_epi16(T00, _mm_setzero_si128()); \
+ T11 = _mm_unpacklo_epi16(T02, _mm_setzero_si128()); \
+ T12 = _mm_sub_epi32(T10, T11); \
+ T13 = _mm_mullo_epi32(T12, T12); \
+ sum0 = _mm_add_epi32(sum0, T13); \
+ T10 = _mm_unpackhi_epi16(T00, _mm_setzero_si128()); \
+ T11 = _mm_unpackhi_epi16(T02, _mm_setzero_si128()); \
+ T12 = _mm_sub_epi32(T10, T11); \
+ T13 = _mm_mullo_epi32(T12, T12); \
+ sum1 = _mm_add_epi32(sum1, T13)
+
template<int ly>
int sse_sp8(short* fenc, intptr_t strideFenc, pixel* fref, intptr_t strideFref)
{
@@ -291,22 +304,14 @@
for(int i = 0; i < ly; i++)
{
- __m128i T00, T01;
+ __m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
- T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128()); //convert 8-bit to 16-bit
+ T02 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
- T10 = _mm_unpacklo_epi16(T00, _mm_setzero_si128());
- T11 = _mm_unpacklo_epi16(T01, _mm_setzero_si128());
- T12 = _mm_sub_epi32(T10, T11);
- T13 = _mm_mullo_epi32(T12, T12);
- sum0 = _mm_add_epi32(sum0, T13);
- T10 = _mm_unpackhi_epi16(T00, _mm_setzero_si128());
- T11 = _mm_unpackhi_epi16(T01, _mm_setzero_si128());
- T12 = _mm_sub_epi32(T10, T11);
- T13 = _mm_mullo_epi32(T12, T12);
- sum1 = _mm_add_epi32(sum1, T13);
+ SSE_SP8x1;
fenc += strideFenc;
fref += strideFref;
More information about the x265-devel
mailing list