[x265-commits] [x265] pixel: cleared the bug in sse_sp4.
Yuvaraj Venkatesh
yuvaraj at multicorewareinc.com
Tue Oct 15 08:38:46 CEST 2013
details: http://hg.videolan.org/x265/rev/cc35cb2f55e8
branches:
changeset: 4451:cc35cb2f55e8
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Tue Oct 15 11:10:18 2013 +0530
description:
pixel: cleared the bug in sse_sp4.
Subject: [x265] pixel: modified weightUnidir to clear the bug.
details: http://hg.videolan.org/x265/rev/07f03a3fa2b8
branches:
changeset: 4452:07f03a3fa2b8
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Tue Oct 15 11:40:02 2013 +0530
description:
pixel: modified weightUnidir to clear the bug.
Subject: [x265] pixel: cleared the bug in sse_sp8 through sse_sp64
details: http://hg.videolan.org/x265/rev/8c8d5700d22b
branches:
changeset: 4453:8c8d5700d22b
user: Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
date: Tue Oct 15 11:19:44 2013 +0530
description:
pixel: cleared the bug in sse_sp8 through sse_sp64
diffstat:
source/common/vec/pixel-sse41.cpp | 30 +++++++++++++++++++++---------
1 files changed, 21 insertions(+), 9 deletions(-)
diffs (138 lines):
diff -r 764c0e9984f0 -r 8c8d5700d22b source/common/vec/pixel-sse41.cpp
--- a/source/common/vec/pixel-sse41.cpp Tue Oct 15 00:50:51 2013 -0500
+++ b/source/common/vec/pixel-sse41.cpp Tue Oct 15 11:19:44 2013 +0530
@@ -4883,7 +4883,7 @@ int sse_pp_64(pixel* fenc, intptr_t stri
void weightUnidir(short *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
{
- __m128i w00, roundoff, ofs, fs, tmpsrc, tmpdst, tmp;
+ __m128i w00, roundoff, ofs, fs, tmpsrc, tmpdst, tmp, sign;
int x, y;
w00 = _mm_set1_epi32(w0);
@@ -4895,7 +4895,8 @@ void weightUnidir(short *src, pixel *dst
for (x = 0; x <= width - 4; x += 4)
{
tmpsrc = _mm_loadl_epi64((__m128i*)(src + x));
- tmpsrc = _mm_unpacklo_epi16(tmpsrc, _mm_setzero_si128());
+ sign = _mm_srai_epi16(tmpsrc, 15);
+ tmpsrc = _mm_unpacklo_epi16(tmpsrc, sign);
tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
*(uint32_t*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128()));
}
@@ -4903,7 +4904,8 @@ void weightUnidir(short *src, pixel *dst
if (width > x)
{
tmpsrc = _mm_loadl_epi64((__m128i*)(src + x));
- tmpsrc = _mm_unpacklo_epi16(tmpsrc, _mm_setzero_si128());
+ sign = _mm_srai_epi16(tmpsrc, 15);
+ tmpsrc = _mm_unpacklo_epi16(tmpsrc, sign);
tmpdst = _mm_add_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_mullo_epi32(w00, _mm_add_epi32(tmpsrc, ofs)), roundoff), shift), fs);
tmp = _mm_packus_epi16(_mm_packs_epi32(tmpdst, tmpdst), _mm_setzero_si128());
union
@@ -4977,7 +4979,8 @@ int sse_sp4(short* fenc, intptr_t stride
__m128i T00, T01, T02, T03;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_cvtsi32_si128(*(uint32_t*)(fref));
- T00 = _mm_unpacklo_epi16(T00, _mm_setzero_si128());
+ __m128i sign = _mm_srai_epi16(T00, 15);
+ T00 = _mm_unpacklo_epi16(T00, sign);
T01 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
T01 = _mm_unpacklo_epi16(T01, _mm_setzero_si128());
T02 = _mm_sub_epi32(T00, T01);
@@ -4994,12 +4997,13 @@ int sse_sp4(short* fenc, intptr_t stride
}
#define SSE_SP8x1 \
- T10 = _mm_unpacklo_epi16(T00, _mm_setzero_si128()); \
+ sign = _mm_srai_epi16(T00, 15); \
+ T10 = _mm_unpacklo_epi16(T00, sign); \
T11 = _mm_unpacklo_epi16(T02, _mm_setzero_si128()); \
T12 = _mm_sub_epi32(T10, T11); \
T13 = _mm_mullo_epi32(T12, T12); \
sum0 = _mm_add_epi32(sum0, T13); \
- T10 = _mm_unpackhi_epi16(T00, _mm_setzero_si128()); \
+ T10 = _mm_unpackhi_epi16(T00, sign); \
T11 = _mm_unpackhi_epi16(T02, _mm_setzero_si128()); \
T12 = _mm_sub_epi32(T10, T11); \
T13 = _mm_mullo_epi32(T12, T12); \
@@ -5015,6 +5019,7 @@ int sse_sp8(short* fenc, intptr_t stride
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
@@ -5042,18 +5047,20 @@ int sse_sp12(short* fenc, intptr_t strid
{
__m128i T00, T01;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
T01 = _mm_srli_si128(_mm_slli_si128(T01, 4), 4); //masking last 4 8-bit integers
- T10 = _mm_unpacklo_epi16(T00, _mm_setzero_si128());
+ sign = _mm_srai_epi16(T00, 15);
+ T10 = _mm_unpacklo_epi16(T00, sign);
T11 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
T11 = _mm_unpacklo_epi16(T11, _mm_setzero_si128());
T12 = _mm_sub_epi32(T10, T11);
T13 = _mm_mullo_epi32(T12, T12);
sum0 = _mm_add_epi32(sum0, T13);
- T10 = _mm_unpackhi_epi16(T00, _mm_setzero_si128());
+ T10 = _mm_unpackhi_epi16(T00, sign);
T11 = _mm_unpacklo_epi8(T01, _mm_setzero_si128());
T11 = _mm_unpackhi_epi16(T11, _mm_setzero_si128());
T12 = _mm_sub_epi32(T10, T11);
@@ -5062,7 +5069,7 @@ int sse_sp12(short* fenc, intptr_t strid
T00 = _mm_loadu_si128((__m128i*)(fenc + 8));
- T10 = _mm_unpacklo_epi16(T00, _mm_setzero_si128());
+ T10 = _mm_unpacklo_epi16(T00, sign);
T11 = _mm_unpackhi_epi8(T01, _mm_setzero_si128());
T11 = _mm_unpacklo_epi16(T11, _mm_setzero_si128());
T12 = _mm_sub_epi32(T10, T11);
@@ -5088,6 +5095,7 @@ int sse_sp16(short* fenc, intptr_t strid
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
@@ -5121,6 +5129,7 @@ int sse_sp24(short* fenc, intptr_t strid
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
@@ -5160,6 +5169,7 @@ int sse_sp32(short* fenc, intptr_t strid
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
@@ -5204,6 +5214,7 @@ int sse_sp48(short* fenc, intptr_t strid
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
@@ -5259,6 +5270,7 @@ int sse_sp64(short* fenc, intptr_t strid
{
__m128i T00, T01, T02;
__m128i T10, T11, T12, T13;
+ __m128i sign;
T00 = _mm_loadu_si128((__m128i*)(fenc));
T01 = _mm_loadu_si128((__m128i*)(fref));
More information about the x265-commits mailing list