[x265] [PATCH] sse_pp12: Replaced costly psrli + pmovzx and psrli + add
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Aug 8 13:12:19 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1375960331 -19800
# Node ID f86aa153a61612c98dce25321818148c0715bdd6
# Parent e2684493138f138c0f1c14bb01d41152bad3878e
sse_pp12: Replaced costly psrli + pmovzx and psrli + add
diff -r e2684493138f -r f86aa153a616 source/common/vec/sse.inc
--- a/source/common/vec/sse.inc Thu Aug 08 16:11:26 2013 +0530
+++ b/source/common/vec/sse.inc Thu Aug 08 16:42:11 2013 +0530
@@ -76,36 +76,34 @@
int sse_pp12(pixel* Org, intptr_t strideOrg, pixel* Cur, intptr_t strideCur)
{
int rows = ly;
- __m128i sum = _mm_set1_epi32(0);
+ __m128i sum = _mm_setzero_si128();
+ __m128i zero = _mm_setzero_si128();
for (; rows != 0; rows--)
{
__m128i m1 = _mm_loadu_si128((__m128i const*)(Org));
__m128i n1 = _mm_loadu_si128((__m128i const*)(Cur));
- m1 = _mm_srli_si128(_mm_slli_si128(m1, 4), 4);
- n1 = _mm_srli_si128(_mm_slli_si128(n1, 4), 4);
-
__m128i m1lo = _mm_cvtepu8_epi16(m1);
- __m128i m1hi = _mm_srli_si128(m1, 8);
- m1hi = _mm_cvtepu8_epi16(m1hi);
+ __m128i m1hi = _mm_unpackhi_epi8(m1, zero);
__m128i n1lo = _mm_cvtepu8_epi16(n1);
- __m128i n1hi = _mm_srli_si128(n1, 8);
- n1hi = _mm_cvtepu8_epi16(n1hi);
+ __m128i n1hi = _mm_unpackhi_epi8(n1, zero);
- __m128i diff = _mm_sub_epi16(m1lo, n1lo);
- sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+ __m128i difflo = _mm_sub_epi16(m1lo, n1lo);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(difflo, difflo));
- diff = _mm_sub_epi16(m1hi, n1hi);
- sum = _mm_add_epi32(sum, _mm_madd_epi16(diff, diff));
+ __m128i diffhi = _mm_sub_epi16(m1hi, n1hi);
+ __m128i sum_temp = _mm_madd_epi16(diffhi, diffhi);
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(_mm_slli_si128(sum_temp, 8), 8));
Org += strideOrg;
Cur += strideCur;
}
- sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
- sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+ sum = _mm_hadd_epi32(sum, sum);
+ sum = _mm_hadd_epi32(sum, sum);
return _mm_cvtsi128_si32(sum);
}
More information about the x265-devel
mailing list