[x265] [PATCH 1 of 2] Merged buffer extension with Horizontal filter; integrated with encoder
deepthidevaki at multicorewareinc.com
Fri Jun 28 15:17:10 CEST 2013
# HG changeset patch
# User Deepthi Devaki
# Date 1372417533 -19800
# Node ID 7f1f8f3b77069aea10a3d4c54e0a8a9554e7bf19
# Parent c79ed90edca573a569751842243331c588137836
Merged buffer extension with Horizontal filter; integrated with encoder
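Conceptually, the horizontal interpolation now pads its own output planes in the same pass instead of leaving that to a separate xExtendPicCompBorder call. A minimal sketch of the fused idea (filterPixel is a hypothetical stand-in for the 8-tap luma filter, not actual x265 code):

    typedef unsigned char pixel;
    pixel filterPixel(const pixel *src, int x);        // hypothetical interpolation helper

    static void filterRowAndExtend(const pixel *src, pixel *dst, int width, int marginX)
    {
        for (int x = 0; x < width; x++)
            dst[x] = filterPixel(src, x);              // horizontal interpolation
        for (int x = 1; x <= marginX; x++)
        {
            dst[-x] = dst[0];                          // replicate into left margin
            dst[width - 1 + x] = dst[width - 1];       // replicate into right margin
        }
    }

This avoids a second full pass over each of the three HPEL output planes.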
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/ipfilter.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -476,7 +476,7 @@
xExtendPicCompBorder(dstP, dstStride, block_width, block_height, marginX, marginY);
}
-void filterHorizontalMultiplane(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
+void filterHorizontalMultiplaneExtend(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY)
{
filterConvertPelToShort(bitDepth, src, srcStride, midF, midStride, block_width, block_height);
filterHorizontal_pel_short<8>(bitDepth, src, srcStride, midB, midStride, block_width, block_height, TComInterpolationFilter::m_lumaFilter[2]);
@@ -485,6 +485,10 @@
filterConvertShortToPel(bitDepth, midA, midStride, pDstA, pDstStride, block_width, block_height);
filterConvertShortToPel(bitDepth, midB, midStride, pDstB, pDstStride, block_width, block_height);
filterConvertShortToPel(bitDepth, midC, midStride, pDstC, pDstStride, block_width, block_height);
+
+ xExtendPicCompBorder(pDstA, pDstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(pDstB, pDstStride, block_width, block_height, marginX, marginY);
+ xExtendPicCompBorder(pDstC, pDstStride, block_width, block_height, marginX, marginY);
}
}
@@ -516,6 +520,6 @@
p.ipFilter_s_s[FILTER_V_S_S_4] = filterVertical_short_short<4>;
p.filterVmulti = filterVerticalMultiplaneExtend;
- p.filterHmulti = filterHorizontalMultiplane;
+ p.filterHmulti = filterHorizontalMultiplaneExtend;
}
}
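For reference, xExtendPicCompBorder pads a plane by replicating its edge pixels outward into the margins. A simplified standalone sketch of that behavior (assuming typedef uint8_t pixel; not the exact x265 implementation):

    #include <stdint.h>
    #include <string.h>

    typedef uint8_t pixel;

    void extendBorderSketch(pixel *img, int stride, int width, int height,
                            int marginX, int marginY)
    {
        for (int y = 0; y < height; y++)               // replicate left/right columns
        {
            pixel *row = img + y * stride;
            for (int x = 1; x <= marginX; x++)
            {
                row[-x] = row[0];
                row[width - 1 + x] = row[width - 1];
            }
        }
        pixel *top = img - marginX;                    // replicate top/bottom rows,
        pixel *bot = img + (height - 1) * stride - marginX; // margins included
        for (int y = 1; y <= marginY; y++)
        {
            memcpy(top - y * stride, top, (width + 2 * marginX) * sizeof(pixel));
            memcpy(bot + y * stride, bot, (width + 2 * marginX) * sizeof(pixel));
        }
    }

The vectorized versions further below do the same thing, but interleave the column replication with the filter loop and use 16-byte stores across the margins.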
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/primitives.h
--- a/source/common/primitives.h Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/primitives.h Fri Jun 28 16:35:33 2013 +0530
@@ -210,7 +210,7 @@
typedef void (*calcrecon_t)(pixel* piPred, short* piResi, pixel* piReco, short* piRecQt, pixel *piRecIPred, int uiStride, int uiRecQtStride, int uiRecIPredStride);
typedef void (*transpose_t)(pixel* pDst, pixel* pSrc, intptr_t nStride);
typedef void (*filterVmulti_t)(int bitDepth, short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY);
-typedef void (*filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height);
+typedef void (*filterHmulti_t)(int bitDepth, pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY);
typedef void (*dequant_t)(int bitDepth, const int* pSrc, int* pDes, int iWidth, int iHeight, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int uiLog2TrSize, int *piDequantCoef);
typedef uint32_t (*quantaq_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int *arlCCoef, int qBitsC, int qBits, int add, int numCoeff);
typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/reference.cpp
--- a/source/common/reference.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/reference.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -107,7 +107,7 @@
primitives.filterHmulti(g_bitDepthY, srcPtr, m_lumaStride, // source buffer
intPtrF, intPtrA, intPtrB, intPtrC, m_intStride, // 4 intermediate HPEL buffers
m_lumaPlane[1][0] + bufOffset, m_lumaPlane[2][0] + bufOffset, m_lumaPlane[3][0] + bufOffset, m_lumaStride, // 3 (x=n, y=0) output buffers (no V interp)
- m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY));
+ m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
}
if (!m_pool)
@@ -164,10 +164,10 @@
pixel *dstPtr3 = m_lumaPlane[x][3] - s_tmpMarginY * m_lumaStride - s_tmpMarginX;
primitives.filterVmulti(g_bitDepthY, intPtr, m_intStride, dstPtr1, dstPtr2, dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight, m_reconPic->m_iLumaMarginX - s_tmpMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY);
- if (x > 0)
- {
- /* the Y=0 planes were not extended by the horizontal filter */
- int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
- m_reconPic->xExtendPicCompBorder((Pel *)m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
- }
+ //if (x > 0)
+ //{
+ // /* the Y=0 planes were not extended by the horizontal filter */
+ // int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride - (s_tmpMarginX + s_intMarginX);
+ // m_reconPic->xExtendPicCompBorder((Pel *)m_lumaPlane[x][0] + bufOffset, m_lumaStride, m_filterWidth + (2 * s_intMarginX), m_filterHeight + (2 * s_intMarginY), m_reconPic->m_iLumaMarginX - s_tmpMarginX - s_intMarginX, m_reconPic->m_iLumaMarginY - s_tmpMarginY - s_intMarginY);
+ //}
}
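A note on the margin arguments in this call: the filtered block already spans the temporary and intermediate margins, so the extension only needs to fill the remaining outer portion of the picture margin. With hypothetical values (illustration only; the actual numbers depend on the build):

    int lumaMarginX = 64;                 // full picture margin (assumed)
    int tmpMarginX = 4, intMarginX = 4;   // margins already covered by the filtered block (assumed)
    int extendX = lumaMarginX - tmpMarginX - intMarginX;  // 56 columns left to replicate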
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/vec/ipfilter.inc Fri Jun 28 16:35:33 2013 +0530
@@ -55,7 +55,7 @@
#if !HIGH_BIT_DEPTH
p.filterVmulti = filterVerticalMultiplaneExtend;
- p.filterHmulti = filterHorizontalMultiplane;
+ p.filterHmulti = filterHorizontalMultiplaneExtend;
#endif
}
}
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc Fri Jun 28 02:09:19 2013 -0500
+++ b/source/common/vec/ipfilter8.inc Fri Jun 28 16:35:33 2013 +0530
@@ -773,8 +773,8 @@
}
}
-void filterHorizontalMultiplane(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height)
-{
+#if INSTRSET < 5
+void filterHorizontalMultiplaneExtend(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY){
int row, col;
src -= (8 / 2 - 1);
@@ -789,27 +789,22 @@
Vec8s vec_offset(offset);
Vec8s sumaL, sumaH, sumbL, sumbH, sumcL, sumcH, tmp, exp1;
Vec8s valL, valH;
- // Load Ai, ai += Ai*coefi
+ Vec16uc tmp16a, tmp16b, tmp16c;
+ // Load Ai, ai += Ai*coefi
for (row = 0; row < block_height; row++)
{
col = 0;
- for (; col + 16 <= (block_width); col += 16) // Iterations multiple of 8
- {
vec_src0.load(src + col);
sumbL = -(extend_low(vec_src0));
- sumbH = -(extend_high(vec_src0));
// a = b+=4*a1, c+=1*a1
vec_src0.load(src + col + 1); // Load the 8 elements
sumcL = extend_low(vec_src0);
sumbL += (sumcL << 2);
sumaL = sumbL;
- sumcH = extend_high(vec_src0);
- sumbH += (sumcH << 2);
- sumaH = sumbH;
-
+
// a +=-10*a2 b+=-11*a2 c+=-5*a2
vec_src0.load(src + col + 2);
tmp = extend_low(vec_src0);
@@ -819,13 +814,6 @@
tmp <<= 1;
sumaL += tmp;
sumbL += tmp;
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- tmp *= (-5);
- sumcH += tmp;
- tmp <<= 1;
- sumaH += tmp;
- sumbH += tmp;
// a +=58*a3 b+=40*a3 c+=17*a3
vec_src0.load(src + col + 3);
@@ -837,15 +825,7 @@
tmp *= 40;
sumbL += tmp;
sumaL += (tmp + exp1);
- tmp = extend_high(vec_src0);
- ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col + 8); // storing A as short into intermediate buffer
- exp1 = (tmp << 4) + tmp;
- sumcH += exp1;
- sumaH += tmp;
- tmp *= 40;
- sumbH += tmp;
- sumaH += (tmp + exp1);
-
+
// a +=17*a4 b+=40*a4 c+=58*a4
vec_src0.load(src + col + 4);
tmp = extend_low(vec_src0);
@@ -855,13 +835,108 @@
tmp *= 40;
sumbL += tmp;
sumcL += (tmp + exp1);
- tmp = extend_high(vec_src0);
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0.load(src + col + 5);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ tmp *= (-5);
+ sumaL += tmp;
+ tmp <<= 1;
+ sumcL += tmp;
+ sumbL += tmp;
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0.load(src + col + 6);
+ tmp = extend_low(vec_src0);
+ sumaL += tmp;
+ tmp <<= 2;
+ sumbL += tmp;
+ sumcL += tmp;
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0.load(src + col + 7);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ sumcL -= tmp;
+ sumaL = (sumaL + vec_offset); // Add offset to sum_low
+ sumbL = (sumbL + vec_offset);
+ sumcL = (sumcL + vec_offset);
+
+ sumaL.store(dstA + col); // Store vector
+ sumaL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16a = compress_unsafe(sumaL, sumaL); // It's certainly safe here, just a misnomer
+ tmp16a.store_partial(8,pDstA + row * pDstStride + col);
+
+ sumbL.store(dstB + col);
+ sumbL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16b = compress_unsafe(sumbL, sumbL);
+ tmp16b.store_partial(8,pDstB + row * pDstStride + col);
+
+ sumcL.store(dstC + col);
+ sumcL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16c = compress_unsafe(sumcL, sumcL);
+ tmp16c.store_partial(8,pDstC + row * pDstStride + col);
+
+ //Extend First column
+ __m128i ma, mb, mc;
+ ma = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16a);
+ mb = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16b);
+ mc = permute16uc<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>((Vec16uc)tmp16c);
+
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + i), ma);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + i), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + i), mc);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride - 16), ma); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride - 16), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride - 16), mc);
+
+ col+=8;
+
+ for (; col + 8/*16*/ <= (block_width); col += 8/*16*/) // Iterations multiple of 8
+ {
+ vec_src0.load(src + col);
+ sumbL = -(extend_low(vec_src0));
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0.load(src + col + 1); // Load the 8 elements
+ sumcL = extend_low(vec_src0);
+ sumbL += (sumcL << 2);
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0.load(src + col + 2);
+ tmp = extend_low(vec_src0);
+ sumbL -= tmp;
+ tmp *= (-5);
+ sumcL += tmp;
+ tmp <<= 1;
+ sumaL += tmp;
+ sumbL += tmp;
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0.load(src + col + 3);
+ tmp = extend_low(vec_src0);
+ ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + col); // storing A as short into intermediate buffer
exp1 = (tmp << 4) + tmp;
- sumaH += exp1;
- sumcH += tmp;
+ sumcL += exp1;
+ sumaL += tmp;
tmp *= 40;
- sumbH += tmp;
- sumcH += (tmp + exp1);
+ sumbL += tmp;
+ sumaL += (tmp + exp1);
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0.load(src + col + 4);
+ tmp = extend_low(vec_src0);
+ exp1 = (tmp << 4) + tmp;
+ sumaL += exp1;
+ sumcL += tmp;
+ tmp *= 40;
+ sumbL += tmp;
+ sumcL += (tmp + exp1);
// a +=-5*a5 b+=-11*a5 c+=-10*a5
vec_src0.load(src + col + 5);
@@ -872,14 +947,7 @@
tmp <<= 1;
sumcL += tmp;
sumbL += tmp;
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- tmp *= (-5);
- sumaH += tmp;
- tmp <<= 1;
- sumcH += tmp;
- sumbH += tmp;
-
+
// a +=1*a6 b+=4*a6 c+=4*a6
vec_src0.load(src + col + 6);
tmp = extend_low(vec_src0);
@@ -887,11 +955,6 @@
tmp <<= 2;
sumbL += tmp;
sumcL += tmp;
- tmp = extend_high(vec_src0);
- sumaH += tmp;
- tmp <<= 2;
- sumbH += tmp;
- sumcH += tmp;
// a +=0*a7 b+=-1*a7 c+=-1*a7
vec_src0.load(src + col + 7);
@@ -901,43 +964,31 @@
sumaL = (sumaL + vec_offset); // Add offset to sum_low
sumbL = (sumbL + vec_offset);
sumcL = (sumcL + vec_offset);
- tmp = extend_high(vec_src0);
- sumbH -= tmp;
- sumcH -= tmp;
- sumaH = (sumaH + vec_offset);
- sumbH = (sumbH + vec_offset);
- sumcH = (sumcH + vec_offset);
sumaL.store(dstA + col); // Store vector
- sumaH.store(dstA + col + 8); // Store vector
- valL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumaH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstA + row * pDstStride + col);
+ sumaL = (sumaL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16a = compress_unsafe(sumaL, sumaL);
+ tmp16a.store_partial(8,pDstA + row * pDstStride + col);
sumbL.store(dstB + col);
- sumbH.store(dstB + col + 8);
- valL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumbH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstB + row * pDstStride + col);
+ sumbL = (sumbL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16b = compress_unsafe(sumbL, sumbL);
+ tmp16b.store_partial(8,pDstB + row * pDstStride + col);
sumcL.store(dstC + col);
- sumcH.store(dstC + col + 8);
- valL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
- valH = (sumcH + IF_INTERNAL_OFFS + 32) >> 6;
- compress_unsafe(valL, valH).store(pDstC + row * pDstStride + col);
+ sumcL = (sumcL + IF_INTERNAL_OFFS + 32) >> 6;
+ tmp16c = compress_unsafe(sumcL, sumcL);
+ tmp16c.store_partial(8,pDstC + row * pDstStride + col);
+
}
if (block_width - col > 0)
{
- vec_src0.load(src + block_width - 13);
- if (block_width - col > 8)
- {
- tmp = extend_low(vec_src0);
- ((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + block_width - 16);
- }
- tmp = extend_high(vec_src0);
+ vec_src0.load(src + block_width - 5);
+ tmp = extend_low(vec_src0);
((tmp << 6) - IF_INTERNAL_OFFS).store(dstF + block_width - 8);
+ short vala, valb, valc;
for (; col < block_width; col++) // Remaining iterations
{
vec_src0.load(src + col);
@@ -945,9 +996,9 @@
int isuma = horizontal_add(tmp * Vec8s(-1, 4, -10, 58, 17, -5, 1, 0));
int isumb = horizontal_add(tmp * Vec8s(-1, 4, -11, 40, 40, -11, 4, -1));
int isumc = horizontal_add(tmp * Vec8s(0, 1, -5, 17, 58, -10, 4, -1));
- short vala = (short)(isuma + offset) >> shift;
- short valb = (short)(isumb + offset) >> shift;
- short valc = (short)(isumc + offset) >> shift;
+ vala = (short)(isuma + offset) >> shift;
+ valb = (short)(isumb + offset) >> shift;
+ valc = (short)(isumc + offset) >> shift;
dstA[col] = vala;
vala = (vala + IF_INTERNAL_OFFS + 32) >> 6;
if (vala < 0) vala = 0;
@@ -967,14 +1018,359 @@
valc = 255;
pDstC[row * pDstStride + col] = (pixel)valc;
}
+ tmp16a = Vec16uc(vala);
+ tmp16b = Vec16uc(valb);
+ tmp16c = Vec16uc(valc);
}
+ else
+ {
+ tmp16a = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16a);
+ tmp16b = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16b);
+ tmp16c = permute16uc<15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15>((Vec16uc)tmp16c);
+ }
+ //Extend last column
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX + i), tmp16a);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX + i), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX + i), tmp16c);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX - 16), tmp16a); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX - 16), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX - 16), tmp16c);
+
src += srcStride;
dstF += dstStride;
dstA += dstStride;
dstB += dstStride;
dstC += dstStride;
}
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = pDstA + (block_height - 1) * pDstStride - marginX;
+ pi = pDstB + (block_height - 1) * pDstStride - marginX;
+ pp = pDstC + (block_height - 1) * pDstStride - marginX;
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe + y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi + y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp + y * pDstStride, pp, block_width + marginX * 2);
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe - y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi - y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp - y * pDstStride, pp, block_width + marginX * 2);
}
+#else
+void filterHorizontalMultiplaneExtend(int /*bitDepth*/, pixel *src, int srcStride, short *dstF, short* dstA, short* dstB, short* dstC, int dstStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX, int marginY)
+{
+ int row, col;
+
+ src -= (8 / 2 - 1);
+ int offset;
+ int headRoom = IF_INTERNAL_PREC - 8;
+ int shift = IF_FILTER_PREC;
+ shift -= headRoom;
+ offset = -IF_INTERNAL_OFFS << shift;
+
+ __m128i vec_src0;
+ __m128i vec_offset = _mm_set1_epi16(offset);
+ __m128i sumaL, sumbL, sumcL, tmp, exp1;
+ __m128i tmp16a, tmp16b, tmp16c;
+
+ // Load Ai, ai += Ai*coefi
+ for (row = 0; row < block_height; row++)
+ {
+ col = 0;
+
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ sumbL = (_mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()));
+ sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+ sumcL = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL,_mm_cvtsi32_si128(2)));
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF+col),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumcL = _mm_add_epi16(sumcL, exp1);
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumaL = _mm_add_epi16(sumaL, exp1);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp =_mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(2));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ sumcL = _mm_sub_epi16(sumcL, tmp);
+ sumaL = _mm_add_epi16(sumaL, vec_offset);
+ sumbL = _mm_add_epi16(sumbL, vec_offset);
+ sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+ _mm_storeu_si128((__m128i*)(dstA+col),sumaL);
+ sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumaL = _mm_sra_epi16(sumaL,_mm_cvtsi32_si128(6));
+ tmp16a = _mm_packus_epi16(sumaL,sumaL);
+ _mm_storel_epi64((__m128i*)(pDstA + row * pDstStride + col),tmp16a);
+
+ _mm_storeu_si128((__m128i*)(dstB+col),sumbL);
+ sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumbL = _mm_sra_epi16(sumbL,_mm_cvtsi32_si128(6));
+ tmp16b = _mm_packus_epi16(sumbL,sumbL);
+ _mm_storel_epi64((__m128i*)(pDstB + row * pDstStride + col),tmp16b);
+
+ _mm_storeu_si128((__m128i*)(dstC+col),sumcL);
+ sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumcL = _mm_sra_epi16(sumcL,_mm_cvtsi32_si128(6));
+ tmp16c = _mm_packus_epi16(sumcL,sumcL);
+ _mm_storel_epi64((__m128i*)(pDstC + row * pDstStride + col),tmp16c);
+
+ //Extend First column
+ __m128i ma, mb, mc;
+ ma = _mm_shuffle_epi8(tmp16a , _mm_set1_epi8(0));
+ mb = _mm_shuffle_epi8(tmp16b , _mm_set1_epi8(0));
+ mc = _mm_shuffle_epi8(tmp16c , _mm_set1_epi8(0));
+
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + i), ma);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + i), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + i), mc);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride - 16), ma); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride - 16), mb);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride - 16), mc);
+
+ col+=8;
+
+ for (; col + 8/*16*/ <= (block_width); col += 8/*16*/) // Iterations multiple of 8
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ sumbL = (_mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()));
+ sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+ // a = b+=4*a1, c+=1*a1
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+ sumcL = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL,_mm_cvtsi32_si128(2)));
+ sumaL = sumbL;
+
+ // a +=-10*a2 b+=-11*a2 c+=-5*a2
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+
+ // a +=58*a3 b+=40*a3 c+=17*a3
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF+col),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumcL = _mm_add_epi16(sumcL, exp1);
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+ // a +=17*a4 b+=40*a4 c+=58*a4
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp,_mm_cvtsi32_si128(4)));
+ sumaL = _mm_add_epi16(sumaL, exp1);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+ // a +=-5*a5 b+=-11*a5 c+=-10*a5
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp =_mm_sll_epi16(tmp,_mm_cvtsi32_si128(1));
+ sumcL = _mm_add_epi16(sumcL, tmp);
+ sumbL = _mm_add_epi16(sumbL, tmp);
+
+ // a +=1*a6 b+=4*a6 c+=4*a6
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumaL = _mm_add_epi16(sumaL, tmp);
+ tmp = _mm_sll_epi16(tmp,_mm_cvtsi32_si128(2));
+ sumbL = _mm_add_epi16(sumbL, tmp);
+ sumcL = _mm_add_epi16(sumcL, tmp);
+
+ // a +=0*a7 b+=-1*a7 c+=-1*a7
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ sumbL = _mm_sub_epi16(sumbL, tmp);
+ sumcL = _mm_sub_epi16(sumcL, tmp);
+ sumaL = _mm_add_epi16(sumaL, vec_offset);
+ sumbL = _mm_add_epi16(sumbL, vec_offset);
+ sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+ _mm_storeu_si128((__m128i*)(dstA+col),sumaL);
+ sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumaL = _mm_sra_epi16(sumaL,_mm_cvtsi32_si128(6));
+ tmp16a = _mm_packus_epi16(sumaL,sumaL);
+ _mm_storel_epi64((__m128i*)(pDstA + row * pDstStride + col),tmp16a);
+
+ _mm_storeu_si128((__m128i*)(dstB+col),sumbL);
+ sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumbL = _mm_sra_epi16(sumbL,_mm_cvtsi32_si128(6));
+ tmp16b = _mm_packus_epi16(sumbL,sumbL);
+ _mm_storel_epi64((__m128i*)(pDstB + row * pDstStride + col),tmp16b);
+
+ _mm_storeu_si128((__m128i*)(dstC+col),sumcL);
+ sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sumcL = _mm_sra_epi16(sumcL,_mm_cvtsi32_si128(6));
+ tmp16c = _mm_packus_epi16(sumcL,sumcL);
+ _mm_storel_epi64((__m128i*)(pDstC + row * pDstStride + col),tmp16c);
+
+ }
+
+ if (block_width - col > 0)
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128());
+ _mm_storeu_si128((__m128i*)(dstF + block_width - 8),_mm_sub_epi16(_mm_sll_epi16(tmp,_mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+ __m128i a, b, c, sum1, sum2, sum3=_mm_setzero_si128();
+ for (; col < block_width; col++) // Remaining iterations
+ {
+ vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+ tmp = _mm_unpacklo_epi8(vec_src0,_mm_setzero_si128()); // Assuming that there is no overflow (Everywhere in this function!)
+ a = _mm_setr_epi16(-1, 4, -10, 58, 17, -5, 1, 0);
+ a = _mm_mullo_epi16(tmp, a);
+ b = _mm_setr_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
+ b = _mm_mullo_epi16(tmp, b);
+ c = _mm_setr_epi16(0, 1, -5, 17, 58, -10, 4, -1);
+ c = _mm_mullo_epi16(tmp, c);
+ sum1 = _mm_hadd_epi16(a,b); // horizontally add 8 elements in 3 steps
+ sum2 = _mm_hadd_epi16(c,c);
+ sum2 = _mm_hadd_epi16(sum1,sum2);
+ sum3 = _mm_hadd_epi16(sum2,sum2);
+ sum3 = _mm_add_epi16(sum3, vec_offset);
+ sum3 = _mm_sra_epi16(sum3,_mm_cvtsi32_si128(shift));
+ dstA[col] = _mm_cvtsi128_si32(sum3);
+ dstB[col] = _mm_extract_epi16(sum3, 1);
+ dstC[col] = _mm_extract_epi16(sum3, 2);
+ sum3 = _mm_add_epi16(sum3, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+ sum3 = _mm_sra_epi16(sum3,_mm_cvtsi32_si128(6));
+ sum3 = _mm_packus_epi16(sum3, sum3);
+ pDstA[row * pDstStride + col] = _mm_extract_epi8(sum3, 0);
+ pDstB[row * pDstStride + col] = _mm_extract_epi8(sum3, 1);
+ pDstC[row * pDstStride + col] = _mm_extract_epi8(sum3, 2);
+ }
+ tmp16a = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(0));
+ tmp16b = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(1));
+ tmp16c = _mm_shuffle_epi8(sum3 , _mm_set1_epi8(2));
+ }
+ else
+ {
+ tmp16a = _mm_shuffle_epi8(tmp16a , _mm_set1_epi8(15));
+ tmp16b = _mm_shuffle_epi8(tmp16b , _mm_set1_epi8(15));
+ tmp16c = _mm_shuffle_epi8(tmp16c , _mm_set1_epi8(15));
+ }
+ //Extend last column
+ for (int i = -marginX; i < -16; i += 16)
+ {
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX + i), tmp16a);
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX + i), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX + i), tmp16c);
+ }
+ _mm_storeu_si128((__m128i*)(pDstA + row * pDstStride + block_width + marginX - 16), tmp16a); /*Assuming marginX > 16*/
+ _mm_storeu_si128((__m128i*)(pDstB + row * pDstStride + block_width + marginX - 16), tmp16b);
+ _mm_storeu_si128((__m128i*)(pDstC + row * pDstStride + block_width + marginX - 16), tmp16c);
+
+ src += srcStride;
+ dstF += dstStride;
+ dstA += dstStride;
+ dstB += dstStride;
+ dstC += dstStride;
+ }
+
+ // Extending bottom rows
+ pixel *pe, *pi, *pp;
+ pe = pDstA + (block_height - 1) * pDstStride - marginX;
+ pi = pDstB + (block_height - 1) * pDstStride - marginX;
+ pp = pDstC + (block_height - 1) * pDstStride - marginX;
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe + y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi + y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp + y * pDstStride, pp, block_width + marginX * 2);
+
+ // Extending top rows
+ pe -= ((block_height - 1) * dstStride);
+ pi -= ((block_height - 1) * dstStride);
+ pp -= ((block_height - 1) * dstStride);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pe - y * pDstStride, pe, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pi - y * pDstStride, pi, block_width + marginX * 2);
+ for (int y = 1; y <= marginY; y++)
+ memcpy(pp - y * pDstStride, pp, block_width + marginX * 2);
+}
+#endif
template<int N>
void filterHorizontal_pel_short(int bitDepth, pixel *src, int srcStride, short *dst, int dstStride, int block_width, int block_height, short const *coeff)
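Both column-extension paths above reduce to broadcasting one packed byte across a 16-byte register and streaming it over the margin. A standalone sketch of that trick (SSSE3 pshufb; not the exact patch code):

    #include <tmmintrin.h>   // _mm_shuffle_epi8

    // An all-zero shuffle mask replicates byte 0 of v into every lane;
    // _mm_set1_epi8(15) would replicate the last byte instead.
    static inline __m128i broadcast_byte(__m128i v, char idx)
    {
        return _mm_shuffle_epi8(v, _mm_set1_epi8(idx));
    }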
diff -r c79ed90edca5 -r 7f1f8f3b7706 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Fri Jun 28 02:09:19 2013 -0500
+++ b/source/test/ipfilterharness.cpp Fri Jun 28 16:35:33 2013 +0530
@@ -313,9 +313,10 @@
bool IPFilterHarness::check_filterHMultiplane(x265::filterHmulti_t ref, x265::filterHmulti_t opt)
{
- short rand_height = 32 + 9; // Can be randomly generated Height
- short rand_width = 32 + 15; // Can be randomly generated Width
- short rand_srcStride, rand_dstStride;
+ short rand_height;
+ short rand_width;
+ int rand_srcStride, rand_dstStride;
+ int marginX, marginY;
short dstAvec[100 * 100];
short dstEvec[100 * 100];
@@ -325,12 +326,12 @@
short dstEref[100 * 100];
short dstIref[100 * 100];
short dstPref[100 * 100];
- pixel pDstAvec[100 * 100];
- pixel pDstAref[100 * 100];
- pixel pDstBvec[100 * 100];
- pixel pDstBref[100 * 100];
- pixel pDstCvec[100 * 100];
- pixel pDstCref[100 * 100];
+ pixel pDstAvec[200 * 200];
+ pixel pDstAref[200 * 200];
+ pixel pDstBvec[200 * 200];
+ pixel pDstBref[200 * 200];
+ pixel pDstCvec[200 * 200];
+ pixel pDstCref[200 * 200];
memset(dstAref, 0, 10000 * sizeof(short));
memset(dstEref, 0, 10000 * sizeof(short));
@@ -340,34 +341,37 @@
memset(dstEvec, 0, 10000 * sizeof(short));
memset(dstIvec, 0, 10000 * sizeof(short));
memset(dstPvec, 0, 10000 * sizeof(short));
- memset(pDstAvec, 0, 10000 * sizeof(pixel));
- memset(pDstAref, 0, 10000 * sizeof(pixel));
- memset(pDstBvec, 0, 10000 * sizeof(pixel));
- memset(pDstBref, 0, 10000 * sizeof(pixel));
- memset(pDstCvec, 0, 10000 * sizeof(pixel));
- memset(pDstCref, 0, 10000 * sizeof(pixel));
+ memset(pDstAvec, 0, 40000 * sizeof(pixel));
+ memset(pDstAref, 0, 40000 * sizeof(pixel));
+ memset(pDstBvec, 0, 40000 * sizeof(pixel));
+ memset(pDstBref, 0, 40000 * sizeof(pixel));
+ memset(pDstCvec, 0, 40000 * sizeof(pixel));
+ memset(pDstCref, 0, 40000 * sizeof(pixel));
for (int i = 0; i <= 100; i++)
{
- rand_srcStride = 64; // Can be randomly generated
- rand_dstStride = 64;
+ rand_height = (rand() % 32) + 1;
+ rand_width = (rand() % 32) + 8;
+ marginX = (rand()%16)+16;
+ marginY = (rand()%16)+16;
+ rand_srcStride = rand_width; // Can be randomly generated
+ rand_dstStride = rand_width+2*marginX;
opt(8, pixel_buff + 3 * rand_srcStride,
rand_srcStride,
dstAvec, dstEvec, dstIvec, dstPvec,
- rand_dstStride, pDstAvec, pDstBvec, pDstCvec, rand_dstStride,
- rand_width,
- rand_height);
+ rand_dstStride, pDstAvec+marginY*rand_dstStride+marginX, pDstBvec+marginY*rand_dstStride+marginX, pDstCvec+marginY*rand_dstStride+marginX,
+ rand_dstStride, rand_width, rand_height, marginX,marginY);
ref(8, pixel_buff + 3 * rand_srcStride,
rand_srcStride,
dstAref, dstEref, dstIref, dstPref,
- rand_dstStride, pDstAref, pDstBref, pDstCref, rand_dstStride,
+ rand_dstStride, pDstAref+marginY*rand_dstStride+marginX, pDstBref+marginY*rand_dstStride+marginX, pDstCref+marginY*rand_dstStride+marginX, rand_dstStride,
rand_width,
- rand_height);
+ rand_height, marginX,marginY);
if (memcmp(dstAvec, dstAref, 100 * 100 * sizeof(short)) || memcmp(dstEvec, dstEref, 100 * 100 * sizeof(short))
|| memcmp(dstIvec, dstIref, 100 * 100 * sizeof(short)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(short))
- || memcmp(pDstAvec, pDstAref, 100 * 100 * sizeof(pixel)) || memcmp(pDstBvec, pDstBref, 100 * 100 * sizeof(pixel))
- || memcmp(pDstCvec, pDstCref, 100 * 100 * sizeof(pixel))
+ || memcmp(pDstAvec, pDstAref, 200 * 200 * sizeof(pixel)) || memcmp(pDstBvec, pDstBref, 200 * 200 * sizeof(pixel))
+ || memcmp(pDstCvec, pDstCref, 200 * 200 * sizeof(pixel))
)
{
return false;
@@ -517,6 +521,6 @@
{
printf("Filter-H-multiplane\t");
REPORT_SPEEDUP(opt.filterHmulti, ref.filterHmulti,
- 8, pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, IPF_vec_output_p, IPF_C_output_p, IPF_vec_output_p, dstStride, width, height);
+ 8, pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, IPF_vec_output_p+ 64 * 200 + 64, IPF_C_output_p+ 64 * 200 + 64, IPF_vec_output_p+ 64 * 200 + 64, dstStride, width, height,64,64);
}
}