[x265] [PATCH] primitives: filter and extend columns only for a CU row

gopu at multicorewareinc.com gopu at multicorewareinc.com
Fri Aug 23 10:06:24 CEST 2013


# HG changeset patch
# User ggopu
# Date 1377245158 -19800
# Node ID 361265127eae4985ed7baa3a9c55e23ae2f2d470
# Parent  e187433abd5e6cc628c26282ecc7452154fc9042
primitives: filter and extend columns only for a CU row

diff -r e187433abd5e -r 361265127eae source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/common/ipfilter.cpp	Fri Aug 23 13:35:58 2013 +0530
@@ -462,6 +462,22 @@
     }
 }
 
+void extendCURowColBorder(pixel* txt, int stride, int width, int height, int marginX)
+{
+    int   x, y;
+
+    for (y = 0; y < height; y++)
+    {
+        for (x = 0; x < marginX; x++)
+        {
+            txt[-marginX + x] = txt[0];
+            txt[width + x] = txt[width - 1];
+        }
+
+        txt += stride;
+    }
+}
+
 void filterVerticalMultiplaneExtend(short *src, int srcStride, pixel *dstE, pixel *dstI, pixel *dstP, int dstStride, int block_width, int block_height, int marginX, int marginY)
 {
     filterVertical_s_p<8>(src, srcStride, dstI, dstStride, block_width, block_height, g_lumaFilter[2]);
@@ -487,6 +503,23 @@
     extendPicCompBorder(pDstC, pDstStride, block_width, block_height, marginX, marginY);
 }
 
+void filterHorizontalExtendCol(pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride, pixel *pDstA, pixel *pDstB, pixel *pDstC, int pDstStride, int block_width, int block_height, int marginX)
+{
+    filterConvertPelToShort(src, srcStride, midF, midStride, block_width, block_height);
+    filterHorizontal_p_s<8>(src, srcStride, midB, midStride, block_width, block_height, g_lumaFilter[2]);
+    filterHorizontal_p_s<8>(src, srcStride, midA, midStride, block_width, block_height, g_lumaFilter[1]);
+    filterHorizontal_p_s<8>(src, srcStride, midC, midStride, block_width, block_height, g_lumaFilter[3]);
+
+    filterConvertShortToPel(midA, midStride, pDstA, pDstStride, block_width, block_height);
+    filterConvertShortToPel(midB, midStride, pDstB, pDstStride, block_width, block_height);
+    filterConvertShortToPel(midC, midStride, pDstC, pDstStride, block_width, block_height);
+
+    extendCURowColBorder(pDstA, pDstStride, block_width, block_height, marginX);
+    extendCURowColBorder(pDstB, pDstStride, block_width, block_height, marginX);
+    extendCURowColBorder(pDstC, pDstStride, block_width, block_height, marginX);
+
+}
+
 void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int scale, int round, int shift, int offset)
 {
     Int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
@@ -587,5 +620,7 @@
 
     p.filterVwghtd = filterVerticalWeighted;         
     p.filterHwghtd = filterHorizontalWeighted;
+
+    p.filterHCU = filterHorizontalExtendCol;
 }
 }
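
For reference, the C path above builds the three quarter-pel planes (A = 1/4-pel,
B = 1/2-pel, C = 3/4-pel) with the 8-tap HEVC luma filters and then replicates the
edge pixels across the horizontal margin. Below is a minimal scalar sketch of one
output sample for the 8-bit path (where the intermediate right-shift is 0); the tap
values match those spelled out in the ipfilter8.inc hunk further down, but the
function and table names here are illustrative, not part of the patch:

    static const int taps[3][8] =
    {
        { -1, 4, -10, 58, 17,  -5, 1,  0 },  // plane A (1/4-pel)
        { -1, 4, -11, 40, 40, -11, 4, -1 },  // plane B (1/2-pel)
        {  0, 1,  -5, 17, 58, -10, 4, -1 },  // plane C (3/4-pel)
    };

    // src points at x - 3, so taps[plane][3] lands on the nearest full sample
    short interpOneSample(const pixel *src, int plane)
    {
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += taps[plane][k] * src[k];
        return (short)(sum - IF_INTERNAL_OFFS);  // intermediate value; the pel
        // output is then clip((value + IF_INTERNAL_OFFS + 32) >> 6)
    }
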
diff -r e187433abd5e -r 361265127eae source/common/primitives.h
--- a/source/common/primitives.h	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/common/primitives.h	Fri Aug 23 13:35:58 2013 +0530
@@ -231,6 +231,8 @@
 typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
 typedef void (*downscale_t)(pixel *src0, pixel *dstf, pixel *dsth, pixel *dstv, pixel *dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height);
+typedef void (*cuRowfilterHmulti_t)(pixel *src, int srcStride, short *midF, short* midA, short* midB, short* midC, int midStride,
+                               pixel *dstA, pixel *dstB, pixel *dstC, int dstStride, int block_width, int block_height, int marginX);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -292,6 +294,8 @@
     scale_t         scale1D_128to64;
     scale_t         scale2D_64to32;
     downscale_t     frame_init_lowres_core;
+
+    cuRowfilterHmulti_t filterHCU;
 };
 
 /* This copy of the table is what gets used by the encoder.
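
The new entry follows the existing primitives pattern: the C implementation
registered above acts as the reference, and a vectorized build may overwrite the
same pointer (see the ipfilter.inc hunk below). A hedged sketch of a call through
the table; 'primitives' is the global EncoderPrimitives instance, while the buffer
names are hypothetical:

    // Filter one CU row horizontally into the F/A/B/C planes and extend
    // the left/right margins of the pel planes.
    primitives.filterHCU(src, srcStride,
                         midF, midA, midB, midC, midStride,
                         dstA, dstB, dstC, dstStride,
                         width, cuRowHeight, marginX);
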
diff -r e187433abd5e -r 361265127eae source/common/vec/ipfilter.inc
--- a/source/common/vec/ipfilter.inc	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/common/vec/ipfilter.inc	Fri Aug 23 13:35:58 2013 +0530
@@ -66,6 +66,7 @@
 #if !(defined(_MSC_VER) && _MSC_VER == 1500 && X86_64)
     p.filterHmulti = filterHorizontalMultiplaneExtend;
     p.filterHwghtd = filterHorizontalWeighted;
+    p.filterHCU = filterHorizontalExtendCol;
 #endif
 #endif
 }
diff -r e187433abd5e -r 361265127eae source/common/vec/ipfilter8.inc
--- a/source/common/vec/ipfilter8.inc	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/common/vec/ipfilter8.inc	Fri Aug 23 13:35:58 2013 +0530
@@ -1802,6 +1802,284 @@
     }
 }
 
+void filterHorizontalExtendCol(pixel *src, int srcStride,
+                                      short *intF, short* intA, short* intB, short* intC, int intStride,
+                                      pixel *dstA, pixel *dstB, pixel *dstC, int dstStride,
+                                      int block_width, int block_height,
+                                      int marginX)
+{
+    int row, col;
+    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
+    int shift = IF_FILTER_PREC - headRoom;
+    int offset = -IF_INTERNAL_OFFS << shift;
+
+    src -= (8 / 2 - 1);  // back up 3 samples so the 8-tap window spans x-3..x+4
+    __m128i vec_src0;
+    __m128i vec_offset = _mm_set1_epi16(offset);
+    __m128i sumaL, sumbL, sumcL, tmp, exp1;
+    __m128i tmp16a, tmp16b, tmp16c;
+
+    // Load Ai, ai += Ai*coefi
+    for (row = 0; row < block_height; row++)
+    {
+        col = 0;
+
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+        sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
+        sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+        // a = b+=4*a1,  c+=1*a1
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+        sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL, _mm_cvtsi32_si128(2)));
+        sumaL = sumbL;
+
+        // a +=-10*a2    b+=-11*a2      c+=-5*a2
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        sumbL = _mm_sub_epi16(sumbL, tmp);
+        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+        sumcL = _mm_add_epi16(sumcL, tmp);
+        tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
+        sumaL = _mm_add_epi16(sumaL, tmp);
+        sumbL = _mm_add_epi16(sumbL, tmp);
+
+        // a +=58*a3    b+=40*a3      c+=17*a3
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+        exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
+        sumcL = _mm_add_epi16(sumcL, exp1);
+        sumaL = _mm_add_epi16(sumaL, tmp);
+        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+        sumbL = _mm_add_epi16(sumbL, tmp);
+        sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+        // a +=17*a4    b+=40*a4      c+=58*a4
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
+        sumaL = _mm_add_epi16(sumaL, exp1);
+        sumcL = _mm_add_epi16(sumcL, tmp);
+        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+        sumbL = _mm_add_epi16(sumbL, tmp);
+        sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+        // a +=-5*a5    b+=-11*a5      c+=-10*a5
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        sumbL = _mm_sub_epi16(sumbL, tmp);
+        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+        sumaL = _mm_add_epi16(sumaL, tmp);
+        tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
+        sumcL = _mm_add_epi16(sumcL, tmp);
+        sumbL = _mm_add_epi16(sumbL, tmp);
+
+        // a +=1*a6    b+=4*a6      c+=4*a6
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        sumaL = _mm_add_epi16(sumaL, tmp);
+        tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(2));
+        sumbL = _mm_add_epi16(sumbL, tmp);
+        sumcL = _mm_add_epi16(sumcL, tmp);
+
+        // a +=0*a7    b+=-1*a7      c+=-1*a7
+        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+        sumbL = _mm_sub_epi16(sumbL, tmp);
+        sumcL = _mm_sub_epi16(sumcL, tmp);
+        sumaL = _mm_add_epi16(sumaL, vec_offset);
+        sumbL = _mm_add_epi16(sumbL, vec_offset);
+        sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+        _mm_storeu_si128((__m128i*)(intA + col), sumaL);
+        sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+        sumaL = _mm_sra_epi16(sumaL, _mm_cvtsi32_si128(6));
+        tmp16a = _mm_packus_epi16(sumaL, sumaL);
+        _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
+
+        _mm_storeu_si128((__m128i*)(intB + col), sumbL);
+        sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+        sumbL = _mm_sra_epi16(sumbL, _mm_cvtsi32_si128(6));
+        tmp16b = _mm_packus_epi16(sumbL, sumbL);
+        _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
+
+        _mm_storeu_si128((__m128i*)(intC + col), sumcL);
+        sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+        sumcL = _mm_sra_epi16(sumcL, _mm_cvtsi32_si128(6));
+        tmp16c = _mm_packus_epi16(sumcL, sumcL);
+        _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
+
+        // Extend First column
+        __m128i ma, mb, mc;
+        ma = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(0));
+        mb = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(0));
+        mc = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(0));
+
+        for (int i = -marginX; i < -16; i += 16)
+        {
+            _mm_storeu_si128((__m128i*)(dstA + row * dstStride +  i), ma);
+            _mm_storeu_si128((__m128i*)(dstB + row * dstStride +  i), mb);
+            _mm_storeu_si128((__m128i*)(dstC + row * dstStride +  i), mc);
+        }
+
+        _mm_storeu_si128((__m128i*)(dstA + row * dstStride - 16), ma); /* assumes marginX >= 16 */
+        _mm_storeu_si128((__m128i*)(dstB + row * dstStride - 16), mb);
+        _mm_storeu_si128((__m128i*)(dstC + row * dstStride - 16), mc);
+
+        col += 8;
+
+        for (; col + 8 <= block_width; col += 8)               // iterations in multiples of 8
+        {
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+            sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
+            sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
+
+            // a = b+=4*a1,  c+=1*a1
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
+            sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL, _mm_cvtsi32_si128(2)));
+            sumaL = sumbL;
+
+            // a +=-10*a2    b+=-11*a2      c+=-5*a2
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            sumbL = _mm_sub_epi16(sumbL, tmp);
+            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+            sumcL = _mm_add_epi16(sumcL, tmp);
+            tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
+            sumaL = _mm_add_epi16(sumaL, tmp);
+            sumbL = _mm_add_epi16(sumbL, tmp);
+
+            // a +=58*a3    b+=40*a3      c+=17*a3
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+            exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
+            sumcL = _mm_add_epi16(sumcL, exp1);
+            sumaL = _mm_add_epi16(sumaL, tmp);
+            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+            sumbL = _mm_add_epi16(sumbL, tmp);
+            sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
+
+            // a +=17*a4    b+=40*a4      c+=58*a4
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            exp1 = _mm_add_epi16(tmp, _mm_sll_epi16(tmp, _mm_cvtsi32_si128(4)));
+            sumaL = _mm_add_epi16(sumaL, exp1);
+            sumcL = _mm_add_epi16(sumcL, tmp);
+            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
+            sumbL = _mm_add_epi16(sumbL, tmp);
+            sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
+
+            // a +=-5*a5    b+=-11*a5      c+=-10*a5
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            sumbL = _mm_sub_epi16(sumbL, tmp);
+            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
+            sumaL = _mm_add_epi16(sumaL, tmp);
+            tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(1));
+            sumcL = _mm_add_epi16(sumcL, tmp);
+            sumbL = _mm_add_epi16(sumbL, tmp);
+
+            // a +=1*a6    b+=4*a6      c+=4*a6
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            sumaL = _mm_add_epi16(sumaL, tmp);
+            tmp = _mm_sll_epi16(tmp, _mm_cvtsi32_si128(2));
+            sumbL = _mm_add_epi16(sumbL, tmp);
+            sumcL = _mm_add_epi16(sumcL, tmp);
+
+            // a +=0*a7    b+=-1*a7      c+=-1*a7
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            sumbL = _mm_sub_epi16(sumbL, tmp);
+            sumcL = _mm_sub_epi16(sumcL, tmp);
+            sumaL = _mm_add_epi16(sumaL, vec_offset);
+            sumbL = _mm_add_epi16(sumbL, vec_offset);
+            sumcL = _mm_add_epi16(sumcL, vec_offset);
+
+            _mm_storeu_si128((__m128i*)(intA + col), sumaL);
+            sumaL = _mm_add_epi16(sumaL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+            sumaL = _mm_sra_epi16(sumaL, _mm_cvtsi32_si128(6));
+            tmp16a = _mm_packus_epi16(sumaL, sumaL);
+            _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
+
+            _mm_storeu_si128((__m128i*)(intB + col), sumbL);
+            sumbL = _mm_add_epi16(sumbL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+            sumbL = _mm_sra_epi16(sumbL, _mm_cvtsi32_si128(6));
+            tmp16b = _mm_packus_epi16(sumbL, sumbL);
+            _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
+
+            _mm_storeu_si128((__m128i*)(intC + col), sumcL);
+            sumcL = _mm_add_epi16(sumcL, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+            sumcL = _mm_sra_epi16(sumcL, _mm_cvtsi32_si128(6));
+            tmp16c = _mm_packus_epi16(sumcL, sumcL);
+            _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
+        }
+
+        if (block_width - col > 0)
+        {
+            vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
+            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
+            _mm_storeu_si128((__m128i*)(intF + block_width - 8), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
+            __m128i a, b, c, sum1, sum2, sum3 = _mm_setzero_si128();
+            for (; col < block_width; col++)                           // Remaining iterations
+            {
+                vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
+                tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());    // Assuming that there is no overflow (Everywhere in this function!)
+                a = _mm_setr_epi16(-1, 4, -10, 58, 17,  -5, 1,  0);
+                a = _mm_mullo_epi16(tmp, a);
+                b = _mm_setr_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
+                b = _mm_mullo_epi16(tmp, b);
+                c = _mm_setr_epi16(0, 1,  -5, 17, 58, -10, 4, -1);
+                c = _mm_mullo_epi16(tmp, c);
+                sum1  = _mm_hadd_epi16(a, b);                   // horizontally add 8 elements in 3 steps
+                sum2  = _mm_hadd_epi16(c, c);
+                sum2  = _mm_hadd_epi16(sum1, sum2);
+                sum3  = _mm_hadd_epi16(sum2, sum2);
+                sum3  = _mm_add_epi16(sum3, vec_offset);
+                sum3  = _mm_sra_epi16(sum3, _mm_cvtsi32_si128(shift));
+                intA[col] = _mm_cvtsi128_si32(sum3);
+                intB[col] = _mm_extract_epi16(sum3, 1);
+                intC[col] = _mm_extract_epi16(sum3, 2);
+                sum3 = _mm_add_epi16(sum3, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
+                sum3 = _mm_sra_epi16(sum3, _mm_cvtsi32_si128(6));
+                sum3 = _mm_packus_epi16(sum3, sum3);
+                dstA[row * dstStride + col] = _mm_extract_epi8(sum3, 0);
+                dstB[row * dstStride + col] = _mm_extract_epi8(sum3, 1);
+                dstC[row * dstStride + col] = _mm_extract_epi8(sum3, 2);
+            }
+
+            tmp16a = _mm_shuffle_epi8(sum3, _mm_set1_epi8(0));
+            tmp16b = _mm_shuffle_epi8(sum3, _mm_set1_epi8(1));
+            tmp16c = _mm_shuffle_epi8(sum3, _mm_set1_epi8(2));
+        }
+        else
+        {
+            tmp16a = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(15));
+            tmp16b = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(15));
+            tmp16c = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(15));
+        }
+        // Extend last column
+        for (int i = -marginX; i < -16; i += 16)
+        {
+            _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX + i), tmp16a);
+            _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX + i), tmp16b);
+            _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX + i), tmp16c);
+        }
+
+        _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX - 16), tmp16a); /* assumes marginX >= 16 */
+        _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX - 16), tmp16b);
+        _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX - 16), tmp16c);
+
+        src += srcStride;
+        intF += intStride;
+        intA += intStride;
+        intB += intStride;
+        intC += intStride;
+    }
+}
 #endif /* if INSTRSET >= X265_CPU_LEVEL_SSE41 */
 
 #if INSTRSET >= X265_CPU_LEVEL_SSSE3
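
A note on the arithmetic above: the SSE version trades most tap multiplies for
shifts and adds, sharing partial products across the three planes. For sample a3
(taps 58, 40, and 17 for planes A, B, and C) it computes 17*a3 once as
a3 + (a3 << 4), spends the single _mm_mullo_epi16 on 40*a3, and assembles 58*a3
as a3 + 17*a3 + 40*a3. A scalar view of that step (illustrative only):

    int t   = a3;
    int p17 = t + (t << 4);  // exp1 in the code above: 17*a3
    sumC   += p17;           // c += 17*a3
    sumA   += t;             // first piece of 58*a3
    int p40 = 40 * t;        // the one real multiply
    sumB   += p40;           // b += 40*a3
    sumA   += p17 + p40;     // a += 17*a3 + 40*a3, completing 58*a3
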
diff -r e187433abd5e -r 361265127eae source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/test/ipfilterharness.cpp	Fri Aug 23 13:35:58 2013 +0530
@@ -370,6 +370,74 @@
     return true;
 }
 
+bool IPFilterHarness::check_filterHMultiplaneCU(x265::cuRowfilterHmulti_t ref, x265::cuRowfilterHmulti_t opt)
+{
+    short rand_height;
+    short rand_width;
+    int rand_srcStride, rand_dstStride;
+    int marginX, marginY;
+
+    short *sbuf = new short[100 * 100 * 8];
+    short *dstAvec = sbuf;
+    short *dstEvec = dstAvec + 10000;
+    short *dstIvec = dstEvec + 10000;
+    short *dstPvec = dstIvec + 10000;
+    short *dstAref = dstPvec + 10000;
+    short *dstEref = dstAref + 10000;
+    short *dstIref = dstEref + 10000;
+    short *dstPref = dstIref + 10000;
+
+    pixel pDstAvec[200 * 200];
+    pixel pDstAref[200 * 200];
+    pixel pDstBvec[200 * 200];
+    pixel pDstBref[200 * 200];
+    pixel pDstCvec[200 * 200];
+    pixel pDstCref[200 * 200];
+
+    memset(sbuf, 0, 10000 * sizeof(short) * 8);
+    memset(pDstAvec, 0, 40000 * sizeof(pixel));
+    memset(pDstAref, 0, 40000 * sizeof(pixel));
+    memset(pDstBvec, 0, 40000 * sizeof(pixel));
+    memset(pDstBref, 0, 40000 * sizeof(pixel));
+    memset(pDstCvec, 0, 40000 * sizeof(pixel));
+    memset(pDstCref, 0, 40000 * sizeof(pixel));
+
+    for (int i = 0; i <= 100; i++)
+    {
+        rand_height = (rand() % 32) + 1;
+        rand_width = (rand() % 32) + 8;
+        marginX = (rand() % 16) + 16;
+        marginY = (rand() % 16) + 16;
+        rand_srcStride = rand_width;               // Can be randomly generated
+        rand_dstStride = rand_width + 2 * marginX;
+        opt(pixel_buff + 8 * rand_srcStride, rand_srcStride,
+            dstAvec, dstEvec, dstIvec, dstPvec, rand_dstStride,
+            pDstAvec + marginY * rand_dstStride + marginX,
+            pDstBvec + marginY * rand_dstStride + marginX,
+            pDstCvec + marginY * rand_dstStride + marginX, rand_dstStride,
+            rand_width, rand_height, marginX);
+        ref(pixel_buff + 8 * rand_srcStride, rand_srcStride,
+            dstAref, dstEref, dstIref, dstPref, rand_dstStride,
+            pDstAref + marginY * rand_dstStride + marginX,
+            pDstBref + marginY * rand_dstStride + marginX,
+            pDstCref + marginY * rand_dstStride + marginX, rand_dstStride,
+            rand_width, rand_height, marginX);
+
+        if (memcmp(dstAvec, dstAref, 100 * 100 * sizeof(short)) || memcmp(dstEvec, dstEref, 100 * 100 * sizeof(short)) ||
+            memcmp(dstIvec, dstIref, 100 * 100 * sizeof(short)) || memcmp(dstPvec, dstPref, 100 * 100 * sizeof(short)) ||
+            memcmp(pDstAvec, pDstAref, 200 * 200 * sizeof(pixel)) || memcmp(pDstBvec, pDstBref, 200 * 200 * sizeof(pixel)) ||
+            memcmp(pDstCvec, pDstCref, 200 * 200 * sizeof(pixel)))
+        {
+            delete [] sbuf;
+            return false;
+        }
+    }
+
+    delete [] sbuf;
+
+    return true;
+}
+
 bool IPFilterHarness::check_filterHMultiplaneWghtd(x265::filterHwghtd_t ref, x265::filterHwghtd_t opt)
 {
     short rand_height;
@@ -589,7 +657,17 @@
             return false;
         }
     }
+
 
+    if (opt.filterHCU)
+    {
+        if (!check_filterHMultiplaneCU(ref.filterHCU, opt.filterHCU))
+        {
+            printf("Filter-H-multiplane for CU ROW failed\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -681,4 +759,11 @@
         REPORT_SPEEDUP(opt.filterVwghtd, ref.filterVwghtd,
                        short_buff + 8 * srcStride, srcStride, IPF_C_output_p + 64 * 200 + 64, IPF_vec_output_p + 64 * 200 + 64, IPF_C_output_p + 64 * 200 + 64, dstStride, width, height, 64, 64, w, round, shift, offset);
     }
+    
+    if (opt.filterHCU)
+    {
+        printf("Filter-H-multiplane for CU ROW");
+        REPORT_SPEEDUP(opt.filterHCU, ref.filterHCU,
+                       pixel_buff + 8 * srcStride, srcStride, IPF_vec_output_s, IPF_C_output_s, IPF_vec_output_s, IPF_C_output_s, dstStride, IPF_vec_output_p + 64 * 200 + 64, IPF_C_output_p + 64 * 200 + 64, IPF_vec_output_p + 64 * 200 + 64, dstStride, width, height, 64);
+    }
 }
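
Note that the harness draws marginX from [16, 32), which matches the margin
assumption in the SSE implementation: each border extension ends with one
unaligned 16-byte store, so the margin must be at least 16 pixels wide. A
caller-side guard might look like this (illustrative, not part of the patch):

    #include <cassert>

    // The vectorized filterHCU writes the outermost 16 extended pixels
    // with a single unaligned store, so it requires marginX >= 16.
    assert(marginX >= 16);
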
diff -r e187433abd5e -r 361265127eae source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h	Fri Aug 23 13:23:43 2013 +0530
+++ b/source/test/ipfilterharness.h	Fri Aug 23 13:35:58 2013 +0530
@@ -49,6 +49,7 @@
     bool check_filterHMultiplane(x265::filterHmulti_t ref, x265::filterHmulti_t opt);
     bool check_filterHMultiplaneWghtd(x265::filterHwghtd_t ref, x265::filterHwghtd_t opt);
     bool check_filterVMultiplaneWghtd(x265::filterVwghtd_t ref, x265::filterVwghtd_t opt);
+    bool check_filterHMultiplaneCU(x265::cuRowfilterHmulti_t ref, x265::cuRowfilterHmulti_t opt);
 
 public:
 

