[x265] [PATCH] Vertical filter with Border extend
deepthidevaki at multicorewareinc.com
Thu Jun 20 10:49:49 CEST 2013
# HG changeset patch
# User Deepthi Devaki
# Date 1371716776 -19800
# Node ID 2687975a2ccb545f4fb8cd3061f06b85f0f54e71
# Parent 0e1117a1e892249838114f6c91caad7b0f206871
Vertical filter with Border extend
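
For context, here is a minimal scalar sketch of the border-extension step this patch folds into the vertical multiplane filter; it mirrors the xExtendPicCompBorder() helper added to ipfilter.cpp below. The 8-bit pixel typedef and the caller-managed buffer layout (marginX/marginY pixels of padding around the block) are assumptions for illustration only, not part of the patch itself.

    #include <cstring>   // memcpy

    typedef unsigned char pixel;   // assumes !HIGH_BIT_DEPTH (8-bit pixels)

    // Replicate the block's edge pixels into the surrounding margins.
    // 'txt' points at the top-left pixel of the filtered block.
    static void extendBorderSketch(pixel* txt, int stride, int width, int height,
                                   int marginX, int marginY)
    {
        pixel* row = txt;
        for (int y = 0; y < height; y++, row += stride)
        {
            // left/right extension: replicate the first/last pixel of each row
            for (int x = 0; x < marginX; x++)
            {
                row[-marginX + x] = row[0];
                row[width + x]    = row[width - 1];
            }
        }

        // bottom extension: replicate the last (already left/right-extended) row
        pixel* bottom = txt + (height - 1) * stride - marginX;
        for (int y = 0; y < marginY; y++)
            memcpy(bottom + (y + 1) * stride, bottom, sizeof(pixel) * (width + 2 * marginX));

        // top extension: replicate the first (already left/right-extended) row
        pixel* top = txt - marginX;
        for (int y = 0; y < marginY; y++)
            memcpy(top - (y + 1) * stride, top, sizeof(pixel) * (width + 2 * marginX));
    }

The vectorized path in ipfilter8.inc performs the same extension inline while the filtered rows are still in registers, instead of making a second pass over the planes.
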
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/ipfilter.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -310,6 +310,46 @@
filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[3]);
}
+void xExtendPicCompBorder(pixel* piTxt, int iStride, int iWidth, int iHeight, int iMarginX, int iMarginY)
+{
+ int x, y;
+ pixel* pi;
+
+ pi = piTxt;
+ for (y = 0; y < iHeight; y++)
+ {
+ for (x = 0; x < iMarginX; x++)
+ {
+ pi[-iMarginX + x] = pi[0];
+ pi[iWidth + x] = pi[iWidth - 1];
+ }
+
+ pi += iStride;
+ }
+
+ pi -= (iStride + iMarginX);
+ for (y = 0; y < iMarginY; y++)
+ {
+ ::memcpy(pi + (y + 1) * iStride, pi, sizeof(pixel) * (iWidth + (iMarginX << 1)));
+ }
+
+ pi -= ((iHeight - 1) * iStride);
+ for (y = 0; y < iMarginY; y++)
+ {
+ ::memcpy(pi - (y + 1) * iStride, pi, sizeof(pixel) * (iWidth + (iMarginX << 1)));
+ }
+}
+
+void CDECL filterVerticalMultiplaneExtend(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY)
+{
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstI, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[2]);
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstE, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[1]);
+ filterVertical_short_pel<8>(bitDepth, src, srcStride, dstP, dstStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[3]);
+ xExtendPicCompBorder(dstE, dstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(dstI, dstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(dstP, dstStride, block_width, block_height, marginX, marginY);
+}
+
void CDECL filterHorizontalMultiplane(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
{
filterConvertPelToShort(bitDepth, src, srcStride, midF, midStride, block_width, block_height);
@@ -344,7 +384,7 @@
p.ipFilter_p_p[FILTER_V_P_P_8] = filterVertical_pel_pel<8>;
p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
- p.filterVmulti = filterVertical_short_pel_multiplane;
+ p.filterVmulti = filterVerticalMultiplaneExtend;
p.filterHmulti = filterHorizontalMultiplane;
}
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/primitives.h
--- a/source/common/primitives.h Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/primitives.h Thu Jun 20 13:56:16 2013 +0530
@@ -205,7 +205,7 @@
typedef void (CDECL * idct_t)(int *pSrc, short *pDst, intptr_t stride);
typedef void (CDECL * calcresidual_t)(pixel *piOrig, pixel *piPred, short *piRes, int stride);
typedef void (CDECL * calcrecon_t)(pixel* piPred, short* piResi,pixel* piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride);
-typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height);
+typedef void (CDECL * filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY);
typedef void (CDECL * filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height);
/* Define a structure containing function pointers to optimized encoder
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/reference.cpp
--- a/source/common/reference.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/reference.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -157,13 +157,13 @@
pixel *dstPtr2 = m_lumaPlane[x][2] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
pixel *dstPtr3 = m_lumaPlane[x][3] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
- primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight);
+ primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
if (x > 0)
{
int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
m_reconPic->xExtendPicCompBorder(m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
}
- m_reconPic->xExtendPicCompBorder(dstPtr1, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
+ /*m_reconPic->xExtendPicCompBorder(dstPtr1, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
m_reconPic->xExtendPicCompBorder(dstPtr2, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
- m_reconPic->xExtendPicCompBorder(dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
+ m_reconPic->xExtendPicCompBorder(dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);*/
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/vec/ipfilter.inc Thu Jun 20 13:56:16 2013 +0530
@@ -41,8 +41,8 @@
p.ipFilter_p_p[FILTER_V_P_P_4] = filterVertical_pel_pel<4>;
#if !HIGH_BIT_DEPTH
- p.filterVmulti = filterVertical_short_pel_multiplane;
- p.filterHmulti = filterHorizontalMultiplane;
-#endif
+ p.filterVmulti = filterVerticalMultiplaneExtend;
+ p.filterHmulti = filterHorizontalMultiplane;
+#endif
}
}
diff -r 0e1117a1e892 -r 2687975a2ccb source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Tue Jun 18 14:50:03 2013 +0530
+++ b/source/common/vec/ipfilter8.inc Thu Jun 20 13:56:16 2013 +0530
@@ -260,15 +260,18 @@
sumi = _mm_sra_epi32(_mm_add_epi32(sumi, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sumi) \
tmp = _mm_packs_epi32(sumi, _mm_setzero_si128()); \
- *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sumi = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(sumi); \
sume = _mm_sra_epi32(_mm_add_epi32(sume, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sume) \
tmp = _mm_packs_epi32(sume, _mm_setzero_si128()); \
- *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sume = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(sume); \
sump = _mm_sra_epi32(_mm_add_epi32(sump, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sump) \
tmp = _mm_packs_epi32(sump, _mm_setzero_si128()); \
- *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
}
#else /* if INSTRSET >= 5 */
#define PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) { \
@@ -295,22 +298,57 @@
sumi = exp1 - a2 + exp2 + exp5 + exp6 - a5; \
sump = a1 - 5 * a2 + exp3 + exp4 + exp5 + a4 + exp6; \
/* store results */ \
- sumi = (sumi + offset) >> 12; \
+ sumi = _mm_sra_epi32(_mm_add_epi32(sumi, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sumi) \
tmp = _mm_packs_epi32(sumi, _mm_setzero_si128()); \
- *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
- sume = (sume + offset) >> 12; \
+ sumi = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstI + row * dstStride + col) = _mm_cvtsi128_si32(sumi); \
+ sume = _mm_sra_epi32(_mm_add_epi32(sume, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sume) \
tmp = _mm_packs_epi32(sume, _mm_setzero_si128()); \
- *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
- sump = (sump + offset) >> 12; \
+ sume = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstE + row * dstStride + col) = _mm_cvtsi128_si32(sume); \
+ sump = _mm_sra_epi32(_mm_add_epi32(sump, _mm_set1_epi32(offset)), _mm_cvtsi32_si128(12)); \
CLIP0(sump) \
tmp = _mm_packs_epi32(sump, _mm_setzero_si128()); \
- *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(_mm_packus_epi16(tmp, _mm_setzero_si128())); \
+ sump = _mm_packus_epi16(tmp, _mm_setzero_si128()); \
+ *(uint32_t*)(dstP + row * dstStride + col) = _mm_cvtsi128_si32(sump); \
}
#endif /* if INSTRSET >= 5 */
-void CDECL filterVertical_short_pel_multiplane(int /*bitDepth*/, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height)
+#if INSTRSET >= 5
+#define EXTENDCOL(X, Y) { /*X=0 for leftmost column, X=block_width+marginX for rightmost column*/ \
+ tmp16e = _mm_shuffle_epi8(sume, _mm_set1_epi8(Y)); \
+ tmp16i = _mm_shuffle_epi8(sumi, _mm_set1_epi8(Y)); \
+ tmp16p = _mm_shuffle_epi8(sump, _mm_set1_epi8(Y)); \
+ for (int i = -marginX; i < -16; i += 16) \
+ { \
+ _mm_storeu_si128((__m128i*)(dstE + row * dstStride + X + i), tmp16e); \
+ _mm_storeu_si128((__m128i*)(dstI + row * dstStride + X + i), tmp16i); \
+ _mm_storeu_si128((__m128i*)(dstP + row * dstStride + X + i), tmp16p); \
+ } \
+ _mm_storeu_si128((__m128i*)(dstE + row * dstStride + X - 16), tmp16e); /*Assuming marginX > 16*/ \
+ _mm_storeu_si128((__m128i*)(dstI + row * dstStride + X - 16), tmp16i); \
+ _mm_storeu_si128((__m128i*)(dstP + row * dstStride + X - 16), tmp16p); \
+}
+#else /* if INSTRSET >= 5 */
+#define EXTENDCOL(X, Y) { /*X=0 for leftmost column, X=block_width+marginX for rightmost column*/ \
+ tmp16e = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sume); \
+ tmp16i = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sumi); \
+ tmp16p = permute16uc<Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y>((Vec16uc)sump); \
+ for (int i = -marginX; i < -16; i += 16) \
+ { \
+ tmp16e.store(dstE + row * dstStride + X + i); \
+ tmp16i.store(dstI + row * dstStride + X + i); \
+ tmp16p.store(dstP + row * dstStride + X + i); \
+ } \
+ tmp16e.store(dstE + row * dstStride + X - 16); /*Assuming marginX > 16*/ \
+ tmp16i.store(dstI + row * dstStride + X - 16); \
+ tmp16p.store(dstP + row * dstStride + X - 16); \
+}
+#endif /* if INSTRSET >= 5 */
+
+void CDECL filterVerticalMultiplaneExtend(int /*bitDepth*/, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY)
{
int row, col;
int cstride = srcStride;
@@ -323,29 +361,68 @@
offset = 1 << (shift - 1);
offset += IF_INTERNAL_OFFS << IF_FILTER_PREC;
+#if INSTRSET < 5
+ __m128i greater;
+ Vec16uc tmp16e, tmp16i, tmp16p;
Vec4i a0, a1, a2, a3, a4, a5, a6, a7, sum;
Vec8s tmp;
Vec4i val, sume, sumi, sump;
Vec4i exp1, exp2, exp3, exp4, exp5, exp6;
-#if INSTRSET < 5
- __m128i greater;
-#endif
- for (col = 0; col < block_width; col += 4) // Considering block width is always a multiple of 4
+#else
+ __m128i tmp16e, tmp16i, tmp16p;
+ __m128i a0, a1, a2, a3, a4, a5, a6, a7;
+ __m128i tmp;
+ __m128i sume, sumi, sump;
+ __m128i exp1, exp2, exp3, exp4, exp5, exp6;
+#endif /* if INSTRSET < 5 */
+
+ col = 0;
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
{
- tmp.load(src + col);
- a0 = extend_low(tmp);
- tmp.load(src + col + cstride);
- a1 = extend_low(tmp);
- tmp.load(src + col + 2 * cstride);
- a2 = extend_low(tmp);
- tmp.load(src + col + 3 * cstride);
- a3 = extend_low(tmp);
- tmp.load(src + col + 4 * cstride);
- a4 = extend_low(tmp);
- tmp.load(src + col + 5 * cstride);
- a5 = extend_low(tmp);
- tmp.load(src + col + 6 * cstride);
- a6 = extend_low(tmp);
+ PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL(0, 0) row++;
+ PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL(0, 0)
+ }
+
+ col += 4;
+
+ for ( /*col = 0*/; col < block_width - 4; col += 4) // Considering block width is always a multiple of 4
+ {
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
for (row = 0; row < block_height; row++)
{
@@ -359,6 +436,89 @@
PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6)
}
}
+
+ tmp = _mm_loadu_si128((__m128i const*)(src + col));
+ a0 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + cstride));
+ a1 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 2 * cstride));
+ a2 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 3 * cstride));
+ a3 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 4 * cstride));
+ a4 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 5 * cstride));
+ a5 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+ tmp = _mm_loadu_si128((__m128i const*)(src + col + 6 * cstride));
+ a6 = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
+
+ for (row = 0; row < block_height; row++)
+ {
+ PROCESSROW(a0, a1, a2, a3, a4, a5, a6, a7) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a1, a2, a3, a4, a5, a6, a7, a0) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a2, a3, a4, a5, a6, a7, a0, a1) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a3, a4, a5, a6, a7, a0, a1, a2) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a4, a5, a6, a7, a0, a1, a2, a3) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a5, a6, a7, a0, a1, a2, a3, a4) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a6, a7, a0, a1, a2, a3, a4, a5) EXTENDCOL((block_width + marginX), 3) row++;
+ PROCESSROW(a7, a0, a1, a2, a3, a4, a5, a6) EXTENDCOL((block_width + marginX), 3)
+ }
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = dstE + (block_height - 1) * dstStride - marginX;
+ pi = dstI + (block_height - 1) * dstStride - marginX;
+ pp = dstP + (block_height - 1) * dstStride - marginX;
+ int x, y;
+ for (x = 0; x < block_width + (marginX << 1) - 16; x += 16)
+ {
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + x));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + x));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + x));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe + (y + 1) * dstStride + x), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi + (y + 1) * dstStride + x), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp + (y + 1) * dstStride + x), tmp16p);
+ }
+ }
+
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + block_width + (marginX << 1) - 16));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + block_width + (marginX << 1) - 16));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + block_width + (marginX << 1) - 16));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp + (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16p);
+ }
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (x = 0; x < block_width + (marginX << 1) - 16; x += 16)
+ {
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + x));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + x));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + x));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe - (y + 1) * dstStride + x), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi - (y + 1) * dstStride + x), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp - (y + 1) * dstStride + x), tmp16p);
+ }
+ }
+
+ tmp16e = _mm_loadu_si128((__m128i const*)(pe + block_width + (marginX << 1) - 16));
+ tmp16i = _mm_loadu_si128((__m128i const*)(pi + block_width + (marginX << 1) - 16));
+ tmp16p = _mm_loadu_si128((__m128i const*)(pp + block_width + (marginX << 1) - 16));
+ for (y = 0; y < marginY; y++)
+ {
+ _mm_storeu_si128((__m128i*)(pe - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16e);
+ _mm_storeu_si128((__m128i*)(pi - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16i);
+ _mm_storeu_si128((__m128i*)(pp - (y + 1) * dstStride + block_width + (marginX << 1) - 16), tmp16p);
+ }
}
template<int N>
diff -r 0e1117a1e892 -r 2687975a2ccb source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Tue Jun 18 14:50:03 2013 +0530
+++ b/source/test/ipfilterharness.cpp Thu Jun 20 13:56:16 2013 +0530
@@ -264,43 +264,45 @@
{
short rand_height = 32; // Can be randomly generated Height
short rand_width = 32; // Can be randomly generated Width
+ int marginX = 64;
+ int marginY = 64;
short rand_srcStride, rand_dstStride;
- pixel dstEvec[100 * 100];
- pixel dstIvec[100 * 100];
- pixel dstPvec[100 * 100];
+ pixel dstEvec[200 * 200];
+ pixel dstIvec[200 * 200];
+ pixel dstPvec[200 * 200];
- pixel dstEref[100 * 100];
- pixel dstIref[100 * 100];
- pixel dstPref[100 * 100];
+ pixel dstEref[200 * 200];
+ pixel dstIref[200 * 200];
+ pixel dstPref[200 * 200];
- memset(dstEref, 0, 10000 * sizeof(pixel));
- memset(dstIref, 0, 10000 * sizeof(pixel));
- memset(dstPref, 0, 10000 * sizeof(pixel));
+ memset(dstEref, 0, 40000 * sizeof(pixel));
+ memset(dstIref, 0, 40000 * sizeof(pixel));
+ memset(dstPref, 0, 40000 * sizeof(pixel));
- memset(dstEvec, 0, 10000 * sizeof(pixel));
- memset(dstIvec, 0, 10000 * sizeof(pixel));
- memset(dstPvec, 0, 10000 * sizeof(pixel));
+ memset(dstEvec, 0, 40000 * sizeof(pixel));
+ memset(dstIvec, 0, 40000 * sizeof(pixel));
+ memset(dstPvec, 0, 40000 * sizeof(pixel));
for (int i = 0; i <= 100; i++)
{
- rand_srcStride = 64; // Can be randomly generated
- rand_dstStride = 64;
+ rand_srcStride = 200; // Can be randomly generated
+ rand_dstStride = 200;
opt(8, short_buff + 8 * rand_srcStride,
rand_srcStride,
- dstEvec, dstIvec, dstPvec,
+ dstEvec + marginY * rand_dstStride + marginX, dstIvec + marginY * rand_dstStride + marginX, dstPvec + marginY * rand_dstStride + marginX,
rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX, marginY);
ref(8, short_buff + 8 * rand_srcStride,
rand_srcStride,
- dstEref, dstIref, dstPref,
+ dstEref + marginY * rand_dstStride + marginX, dstIref + marginY * rand_dstStride + marginX, dstPref + marginY * rand_dstStride + marginX,
rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX, marginY);
- if (memcmp(dstEvec, dstEref, 100 * 100 * sizeof(pixel))
- || memcmp(dstIvec, dstIref, 100 * 100 * sizeof(pixel)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(pixel)))
+ if (memcmp(dstEvec, dstEref, 200 * 200 * sizeof(pixel))
+ || memcmp(dstIvec, dstIref, 200 * 200 * sizeof(pixel)) || memcmp(dstPvec, dstPref, 200 * 200 * sizeof(pixel)))
{
return false;
}
@@ -508,7 +510,7 @@
{
printf("Filter-V-multiplane\t");
REPORT_SPEEDUP(opt.filterVmulti, ref.filterVmulti,
- 8, short_buff + 8 * srcStride, srcStride, IPF_C_output_p, IPF_vec_output_p, IPF_C_output_p, dstStride, width, height);
+ 8, short_buff + 8 * srcStride, srcStride, IPF_C_output_p + 64 * 200 + 64, IPF_vec_output_p + 64 * 200 + 64, IPF_C_output_p + 64 * 200 + 64, dstStride, width, height, 64, 64);
}
if (opt.filterHmulti)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: xhevc_deepthid.patch
Type: text/x-patch
Size: 24426 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130620/6446f76d/attachment-0001.bin>