[x265] [PATCH] Vertical filter with Border extend
deepthidevaki at multicorewareinc.com
Thu Jun 20 10:49:49 CEST 2013
# HG changeset patch
# User Deepthi Devaki
# Date 1371716776 -19800
# Node ID 2687975a2ccb545f4fb8cd3061f06b85f0f54e71
# Parent 0e1117a1e892249838114f6c91caad7b0f206871
Vertical filter with Border extend
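
For context, here is a minimal scalar sketch of the border-extension step this patch folds into the vertical multiplane filter; it mirrors the xExtendPicCompBorder() helper added to ipfilter.cpp below. The 8-bit pixel typedef and the caller-managed buffer layout (marginX/marginY pixels of padding around the block) are assumptions for illustration only, not part of the patch itself.

    #include <cstring>   // memcpy

    typedef unsigned char pixel;   // assumes !HIGH_BIT_DEPTH (8-bit pixels)

    // Replicate the block's edge pixels into the surrounding margins.
    // 'txt' points at the top-left pixel of the filtered block.
    static void extendBorderSketch(pixel* txt, int stride, int width, int height,
                                   int marginX, int marginY)
    {
        pixel* row = txt;
        for (int y = 0; y < height; y++, row += stride)
        {
            // left/right extension: replicate the first/last pixel of each row
            for (int x = 0; x < marginX; x++)
            {
                row[-marginX + x] = row[0];
                row[width + x]    = row[width - 1];
            }
        }

        // bottom extension: replicate the last (already left/right-extended) row
        pixel* bottom = txt + (height - 1) * stride - marginX;
        for (int y = 0; y < marginY; y++)
            memcpy(bottom + (y + 1) * stride, bottom, sizeof(pixel) * (width + 2 * marginX));

        // top extension: replicate the first (already left/right-extended) row
        pixel* top = txt - marginX;
        for (int y = 0; y < marginY; y++)
            memcpy(top - (y + 1) * stride, top, sizeof(pixel) * (width + 2 * marginX));
    }

The vectorized path in ipfilter8.inc performs the same extension inline while the filtered rows are still in registers, instead of making a second pass over the planes.
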
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/ipfilter.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -310,6 +310,46 @@
filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[3]);
}
+void xExtendPicCompBorder(pixel* piTxt, int iStride, int iWidth, int iHeight, int iMarginX, int iMarginY)
+{
+ int x, y;
+ pixel* pi;
+
+ pi = piTxt;
+ for (y = 0; y < iHeight; y++)
+ {
+ for (x = 0; x < iMarginX; x++)
+ {
+ pi[-iMarginX + x] = pi[0];
+ pi[iWidth + x] = pi[iWidth - 1];
+ }
+
+ pi += iStride;
+ }
+
+ pi -= (iStride + iMarginX);
+ for (y = 0; y < iMarginY; y++)
+ {
+ ::memcpy(pi + (y + 1) * iStride, pi, sizeof(pixel) * (iWidth + (iMarginX << 1)));
+ }
+
+ pi -= ((iHeight - 1) * iStride);
+ for (y = 0; y < iMarginY; y++)
+ {
+ ::memcpy(pi - (y + 1) * iStride, pi, sizeof(pixel) * (iWidth + (iMarginX << 1)));
+ }
+}
+
+void CDECL filterVerticalMultiplaneExtend(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY)
+{
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstI, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[2]);
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstE, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[1]);
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[3]);
+ xExtendPicCompBorder(dstE, dstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(dstI, dstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(dstP, dstStride, block_width, block_height, marginX, marginY);
+}
+
void CDECL filterHorizontalMultiplane(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
{
filterConvertPelToShort(bitDepth, src, srcStride, midF, midStride, block_width, block_height);
@@ -344,7 +384,7 @@
p.ipFilter_p_p[FILTER_V_P_P_8] = filterVertical_pel_pel<8>;
p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
- p.filterVmulti = filterVertical_short_pel_multiplane;
+ p.filterVmulti = filterVerticalMultiplaneExtend;
p.filterHmulti = filterHorizontalMultiplane;
}
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/primitives.h
--- a/source/common/primitives.h Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/primitives.h Thu Jun 20 13:56:16 2013 +0530
@@ -205,7 +205,7 @@
typedef void (CDECL * idct_t)(int *pSrc, short *pDst, intptr_t stride);
typedef void (CDECL * calcresidual_t)(pixel *piOrig, pixel *piPred, short *piRes, int stride);
typedef void (CDECL * calcrecon_t)(pixel* piPred, short* piResi,pixel* piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride);
-typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height);
+typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY);
typedef void (CDECL * filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height);
/* Define a structure containing function pointers to optimized encoder
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/reference.cpp
--- a/source/common/reference.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/reference.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -157,13 +157,13 @@
pixel *dstPtr2 = m_lumaPlane[x][2] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
pixel *dstPtr3 = m_lumaPlane[x][3] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
- primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight);
+ primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
if (x > 0)
{
int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
m_reconPic->xExtendPicCompBorder(m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
}
- m_reconPic->xExtendPicCompBorder(dstPtr1, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
+ /*m_reconPic->xExtendPicCompBorder(dstPtr1, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
m_reconPic->xExtendPicCompBorder(dstPtr2, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
- m_reconPic->xExtendPicCompBorder(dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
+ m_reconPic->xExtendPicCompBorder(dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);*/
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/vec/ipfilter.inc Thu Jun 20 13:56:16 2013 +0530
@@ -41,8 +41,8 @@
p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
#if !HIGH_BIT_DEPTH
- p.filterVmulti = filterVertical_short_pel_multiplane;
- p.filterHmulti = filterHorizontalMultiplane;
-#endif
+ p.filterVmulti = filterVerticalMultiplaneExtend;
+ p.filterHmulti = filterHorizontalMultiplane;
+#endif
}
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/vec/ipfilter8.inc Thu Jun 20 13:56:16 2013 +0530
@@ -260,15 +260,18 @@
sumi = _mm_sra_epi32(_mm_add_epi32(sumi, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sumi) \
tmp = _mm_packs_epi32(sumi, _mm_setzero_si128()); \
- *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sumi = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(sumi); \
sume = _mm_sra_epi32(_mm_add_epi32(sume, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sume) \
tmp = _mm_packs_epi32(sume, _mm_setzero_si128()); \
- *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sume = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(sume); \
sump = _mm_sra_epi32(_mm_add_epi32(sump, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sump) \
tmp = _mm_packs_epi32(sump, _mm_setzero_si128()); \
- *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
}
#else /* if INSTRSET >= 5 */
#define PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) { \
@@ -295,22 +298,57 @@
sumi = exp1 - a2 + exp2 + exp5 + exp6 - a5; \
sump = a1 - 5 * a2 + exp3 + exp4 + exp5 + a4 + exp6; \
/* store results */ \
- sumi = (sumi + offset) >> 12; \
+ sumi = _mm_sra_epi32(_mm_add_epi32(sumi, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sumi) \
tmp = _mm_packs_epi32(sumi, _mm_setzero_si128()); \
- *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
- sume = (sume + offset) >> 12; \
+ sumi = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(sumi); \
+ sume = _mm_sra_epi32(_mm_add_epi32(sume, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sume) \
tmp = _mm_packs_epi32(sume, _mm_setzero_si128()); \
- *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
- sump = (sump + offset) >> 12; \
+ sume = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(sume); \
+ sump = _mm_sra_epi32(_mm_add_epi32(sump, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sump) \
tmp = _mm_packs_epi32(sump, _mm_setzero_si128()); \
- *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
}
#endif /* if INSTRSET >= 5 */
-void CDECL filterVertical_short_pel_multiplane(int /*bitDepth*/, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height)
+#if INSTRSET >= 5
+#define EXTENDCOL(X, Y) { /*X=0 for leftmost column, X=block_width+marginX for rightmost column*/ \
+ tmp16e = _mm_shuffle_epi8(sume, _mm_set1_epi8(Y)); \
+ tmp16i = _mm_shuffle_epi8(sumi, _mm_set1_epi8(Y)); \
+ tmp16p = _mm_shuffle_epi8(sump, _mm_set1_epi8(Y)); \
+ for (int i = -marginX; i < -16; i += 16) \
+ { \
+ _mm_storeu_si128((__m128i*)(dstE + row * dstStride + X + i), tmp16e); \
+ _mm_storeu_si128((__m128i*)(dstI + row * dstStride + X + i), tmp16i); \
+ _mm_storeu_si128((__m128i*)(dstP + row * dstStride + X + i), tmp16p); \
+ } \
+ _mm_storeu_si128((__m128i*)(dstE + row * dstStride + X - 16), tmp16e); /*Assuming marginX > 16*/ \
+ _mm_storeu_si128((__m128i*)(dstI + row * dstStride + X - 16), tmp16i); \
+ _mm_storeu_si128((__m128i*)(dstP + row * dstStride + X - 16), tmp16p); \
+}
+#else /* if INSTRSET >= 5 */
+#define EXTENDCOL(X, Y) { /*X=0 for leftmost column, X=block_width+marginX for rightmost column*/ \
+ tmp16e = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sume); \
+ tmp16i = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sumi); \
+ tmp16p = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sump); \
+ for (int i = -marginX; i < -16; i += 16) \
+ { \
+ tmp16e.store(dstE + row * dstStride + X + i); \
+ tmp16i.store(dstI + row * dstStride + X + i); \
+ tmp16p.store(dstP + row * dstStride + X + i); \
+ } \
+ tmp16e.store(dstE + row * dstStride + X - 16); /*Assuming marginX > 16*/ \
+ tmp16i.store(dstI + row * dstStride + X - 16); \
+ tmp16p.store(dstP + row * dstStride + X - 16); \
+}
+#endif /* if INSTRSET >= 5 */
+
+void CDECL filterVerticalMultiplaneExtend(int /*bitDepth*/, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY)
{
int row, col;
int cstride = srcStride;
@@ -323,29 +361,68 @@
offset = 1 << (shift - 1);
offset += IF_INTERNAL_OFFS << IF_FILTER_PREC;
+#if INSTRSET < 5
+ __m128i greater;
+ Vec16uc tmp16e, tmp16i, tmp16p;
Vec4i a0, a1, a2, a3, a4, a5, a6, a7, sum;
Vec8s tmp;
Vec4i val, sume, sumi, sump;
Vec4i exp1, exp2, exp3, exp4, exp5, exp6;
-#if INSTRSET < 5
- __m128i greater;
-#endif
- for (col = 0; col < block_width; col += 4) // Considering block width is always a multiple of 4
+#else
+ __m128i tmp16e, tmp16i, tmp16p;
+ __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+ __m128i tmp;
+ __m128i sume, sumi, sump;
+ __m128i exp1, exp2, exp3, exp4, exp5, exp6;
+#endif /* if INSTRSET < 5 */
+
+ col = 0;
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
{
- tmp.load(src + col);
- a0 = extend_low(tmp);
- tmp.load(src + col + cstride);
- a1 = extend_low(tmp);
- tmp.load(src + col + 2 * cstride);
- a2 = extend_low(tmp);
- tmp.load(src + col + 3 * cstride);
- a3 = extend_low(tmp);
- tmp.load(src + col + 4 * cstride);
- a4 = extend_low(tmp);
- tmp.load(src + col + 5 * cstride);
- a5 = extend_low(tmp);
- tmp.load(src + col + 6 * cstride);
- a6 = extend_low(tmp);
+ PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL(0, 0)
+ }
+
+ col += 4;
+
+ for ( /*col = 0*/; col < block_width - 4; col += 4) // Considering block width is always a multiple of 4
+ {
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
for (row = 0; row < block_height; row++)
{
@@ -359,6 +436,89 @@
PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6)
}
}
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
+ {
+ PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL((block_width + marginX), 3)
+ }
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = dstE + (block_height - 1) * dstStride - marginX;
+ pi = dstI + (block_height - 1) * dstStride - marginX;
+ pp = dstP + (block_height - 1) * dstStride - marginX;
+ int x, y;
+ for (x = 0; x < block_width + (marginX << 1) - 16; x += 16)
+ {
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + x));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + x));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + x));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe + (y + 1) * dstStride + x), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi + (y + 1) * dstStride + x), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp + (y + 1) * dstStride + x), tmp16p);
+ }
+ }
+
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + block_width + (marginX << 1) - 16));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + block_width + (marginX << 1) - 16));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + block_width + (marginX << 1) - 16));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16p);
+ }
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (x = 0; x < block_width + (marginX << 1) - 16; x += 16)
+ {
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + x));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + x));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + x));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe - (y + 1) * dstStride + x), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi - (y + 1) * dstStride + x), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp - (y + 1) * dstStride + x), tmp16p);
+ }
+ }
+
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + block_width + (marginX << 1) - 16));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + block_width + (marginX << 1) - 16));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + block_width + (marginX << 1) - 16));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16p);
+ }
}
template<int N>
diff -r 0e1117a1e892 -r 2687975a2ccb source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/test/ipfilterharness.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -264,43 +264,45 @@
{
short rand_height = 32; // Can be randomly generated Height
short rand_width = 32; // Can be randomly generated Width
+ int marginX = 64;
+ int marginY = 64;
short rand_srcStride, rand_dstStride;
- pixel dstEvec[100 * 100];
- pixel dstIvec[100 * 100];
- pixel dstPvec[100 * 100];
+ pixel dstEvec[200 * 200];
+ pixel dstIvec[200 * 200];
+ pixel dstPvec[200 * 200];
- pixel dstEref[100 * 100];
- pixel dstIref[100 * 100];
- pixel dstPref[100 * 100];
+ pixel dstEref[200 * 200];
+ pixel dstIref[200 * 200];
+ pixel dstPref[200 * 200];
- memset(dstEref, 0, 10000 * sizeof(pixel));
- memset(dstIref, 0, 10000 * sizeof(pixel));
- memset(dstPref, 0, 10000 * sizeof(pixel));
+ memset(dstEref, 0, 40000 * sizeof(pixel));
+ memset(dstIref, 0, 40000 * sizeof(pixel));
+ memset(dstPref, 0, 40000 * sizeof(pixel));
- memset(dstEvec, 0, 10000 * sizeof(pixel));
- memset(dstIvec, 0, 10000 * sizeof(pixel));
- memset(dstPvec, 0, 10000 * sizeof(pixel));
+ memset(dstEvec, 0, 40000 * sizeof(pixel));
+ memset(dstIvec, 0, 40000 * sizeof(pixel));
+ memset(dstPvec, 0, 40000 * sizeof(pixel));
for (int i = 0; i <= 100; i++)
{
- rand_srcStride = 64; // Can be randomly generated
- rand_dstStride = 64;
+ rand_srcStride = 200; // Can be randomly generated
+ rand_dstStride = 200;
opt(8, short_buff + 8 * rand_srcStride,
rand_srcStride,
- dstEvec, dstIvec, dstPvec,
+ dstEvec + marginY * rand_dstStride + marginX, dstIvec + marginY * rand_dstStride + marginX, dstPvec + marginY * rand_dstStride + marginX,
rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX, marginY);
ref(8, short_buff + 8 * rand_srcStride,
rand_srcStride,
- dstEref, dstIref, dstPref,
+ dstEref + marginY * rand_dstStride + marginX, dstIref + marginY * rand_dstStride + marginX, dstPref + marginY * rand_dstStride + marginX,
rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX, marginY);
- if (memcmp(dstEvec, dstEref, 100 * 100 * sizeof(pixel))
- || memcmp(dstIvec, dstIref, 100 * 100 * sizeof(pixel)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(pixel)))
+ if (memcmp(dstEvec, dstEref, 200 * 200 * sizeof(pixel))
+ || memcmp(dstIvec, dstIref, 200 * 200 * sizeof(pixel)) || memcmp(dstPvec, dstPref, 200 * 200 * sizeof(pixel)))
{
return false;
}
@@ -508,7 +510,7 @@
{
printf("Filter-V-multiplane\t");
REPORT_SPEEDUP(opt.filterVmulti, ref.filterVmulti,
- 8, short_buff + 8 * srcStride, srcStride, IPF_C_output_p, IPF_vec_output_p, IPF_C_output_p, dstStride, width, height);
+ 8, short_buff + 8 * srcStride, srcStride, IPF_C_output_p + 64 * 200 + 64, IPF_vec_output_p + 64 * 200 + 64, IPF_C_output_p + 64 * 200 + 64, dstStride, width, height, 64, 64);
}
if (opt.filterHmulti)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: xhevc_deepthid.patch
Type: text/x-patch
Size: 24426 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130620/6446f76d/attachment-0001.bin>