[x265] [PATCH] dct: replace dequant vector class function with intrinsic
Yuvaraj Venkatesh
yuvaraj at multicorewareinc.com
Thu Oct 10 13:28:57 CEST 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381404467 -19800
# Thu Oct 10 16:57:47 2013 +0530
# Node ID 840229ed3794569f5e15d84289531c829b75dcd6
# Parent a79ecf3a787577a2e557659c7a8d226d7d41ce00
dct: replace dequant vector class function with intrinsic
diff -r a79ecf3a7875 -r 840229ed3794 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/dct-sse3.cpp Thu Oct 10 16:57:47 2013 +0530
@@ -62,68 +62,68 @@
if (shift > per)
{
valueToAdd = 1 << (shift - per - 1);
- Vec4i IAdd(valueToAdd);
+ __m128i IAdd = _mm_set1_epi32(valueToAdd);
for (int n = 0; n < width * height; n = n + 8)
{
- Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
+ __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
- quantCoef1.load(quantCoef + n);
- quantCoef2.load(quantCoef + n + 4);
+ quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+ quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
- deQuantCoef1.load(deQuantCoef + n);
- deQuantCoef2.load(deQuantCoef + n + 4);
+ deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
+ deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
- Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
+ quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, deQuantCoef1), IAdd), _mm_cvtsi32_si128(shift - per));
+ quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
- quantCoef1 = (quantCoef1 * deQuantCoef1 + IAdd) >> (shift - per);
- quantCoef2 = (quantCoef2 * deQuantCoef2 + IAdd) >> (shift - per);
-
- quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef1.store(coef + n);
- quantCoef2 = extend_high(quantCoef12);
- quantCoef2.store(coef + n + 4);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
}
}
else
{
for (int n = 0; n < width * height; n = n + 8)
{
- Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
+ __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
- quantCoef1.load(quantCoef + n);
- quantCoef2.load(quantCoef + n + 4);
+ quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+ quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
- deQuantCoef1.load(deQuantCoef + n);
- deQuantCoef2.load(deQuantCoef + n + 4);
+ deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
+ deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
- Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
+ quantCoef1 = _mm_mullo_epi32(quantCoef1, deQuantCoef1);
+ quantCoef2 = _mm_mullo_epi32(quantCoef2, deQuantCoef2);
- quantCoef1 = quantCoef1 * deQuantCoef1;
- quantCoef2 = quantCoef2 * deQuantCoef2;
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+ quantCoef1 = _mm_sll_epi32(quantCoef1, _mm_cvtsi32_si128(per - shift));
+ quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
-
- quantCoef1 = quantCoef1 << (per - shift);
- quantCoef2 = quantCoef2 << (per - shift);
-
- quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef1.store(coef + n);
- quantCoef2 = extend_high(quantCoef12);
- quantCoef2.store(coef + n + 4);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
}
}
}
@@ -132,29 +132,30 @@
valueToAdd = 1 << (shift - 1);
int scale = invQuantScales[rem] << per;
- Vec4i vScale(scale);
- Vec4i vAdd(valueToAdd);
+ __m128i vScale = _mm_set1_epi32(scale);
+ __m128i vAdd = _mm_set1_epi32(valueToAdd);
for (int n = 0; n < width * height; n = n + 8)
{
- Vec4i quantCoef1, quantCoef2;
- quantCoef1.load(quantCoef + n);
- quantCoef2.load(quantCoef + n + 4);
+ __m128i quantCoef1, quantCoef2, quantCoef12, sign;
- Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+ quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+ quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
- quantCoef1 = extend_low(quantCoef12);
- quantCoef2 = extend_high(quantCoef12);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
- quantCoef1 = (quantCoef1 * vScale + vAdd) >> shift;
- quantCoef2 = (quantCoef2 * vScale + vAdd) >> shift;
+ quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, vScale), vAdd), _mm_cvtsi32_si128(shift));
+ quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, vScale), vAdd), _mm_cvtsi32_si128(shift));
- quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
- quantCoef1 = extend_low(quantCoef12);
- quantCoef1.store(coef + n);
- quantCoef2 = extend_high(quantCoef12);
- quantCoef2.store(coef + n + 4);
+ quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+ sign = _mm_srai_epi16(quantCoef12, 15);
+ quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+ quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+ _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
}
}
}
More information about the x265-devel mailing list