[x265] [PATCH 1 of 5] ipfilter: remove unused weighted interpolation primitives

Steve Borho steve at borho.org
Mon Oct 7 05:45:34 CEST 2013


# HG changeset patch
# User Steve Borho <steve@borho.org>
# Date 1381116610 18000
#      Sun Oct 06 22:30:10 2013 -0500
# Node ID 15c6e2451b347782300b6fe764cb4c9dac35a43a
# Parent  c010342f7605c86867824f5b525a8f84c0d2de1c
ipfilter: remove unused weighted interpolation primitives
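
The removed reference and SSE4.1 primitives combined multi-plane sub-pel
interpolation with unidirectional weighted prediction. For reference, the
weightUnidir() step they wrapped applied roughly the following per-sample
weighting to each 16-bit intermediate value before packing it back to pixel
precision (a simplified sketch for the 8-bit case, not a drop-in replacement;
weight_sample() is a hypothetical name, while pixel, IF_INTERNAL_OFFS and the
scale/round/shift/offset parameters come from the removed code):

    // Sketch of the weighting applied by the removed primitives: scale the
    // offset-biased intermediate sample, round and shift it back down, add
    // the per-reference offset, then clamp to the 8-bit pixel range.
    static inline pixel weight_sample(short mid, int scale, int round,
                                      int shift, int offset)
    {
        int v = ((mid + IF_INTERNAL_OFFS) * scale + round) >> shift;
        v += offset;
        return (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }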

diff -r c010342f7605 -r 15c6e2451b34 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Sun Oct 06 02:09:00 2013 -0500
+++ b/source/common/ipfilter.cpp	Sun Oct 06 22:30:10 2013 -0500
@@ -493,56 +493,6 @@
         dst  += dstStride;
     }
 }
-
-// filterHorizontal, Multiplane, Weighted
-void filterHorizontalWeighted(pixel *src, intptr_t srcStride, short *midF, short* midA, short* midB, short* midC, intptr_t midStride,
-                              pixel *dstF, pixel *dstA, pixel *dstB, pixel *dstC, intptr_t dstStride, int block_width, int block_height,
-                              int marginX, int marginY, int scale, int round, int shift, int offset)
-{
-    filterConvertPelToShort(src, srcStride, midF, midStride, block_width, block_height);
-    filterHorizontal_p_s<8>(src, srcStride, midB, midStride, block_width, block_height, g_lumaFilter[2]);
-    filterHorizontal_p_s<8>(src, srcStride, midA, midStride, block_width, block_height, g_lumaFilter[1]);
-    filterHorizontal_p_s<8>(src, srcStride, midC, midStride, block_width, block_height, g_lumaFilter[3]);
-
-    weightUnidir(midF, dstF, midStride, dstStride, block_width, block_height, scale, round, shift, offset);
-    weightUnidir(midA, dstA, midStride, dstStride, block_width, block_height, scale, round, shift, offset);
-    weightUnidir(midB, dstB, midStride, dstStride, block_width, block_height, scale, round, shift, offset);
-    weightUnidir(midC, dstC, midStride, dstStride, block_width, block_height, scale, round, shift, offset);
-
-    extendPicCompBorder(dstF, dstStride, block_width, block_height, marginX, marginY);
-    extendPicCompBorder(dstA, dstStride, block_width, block_height, marginX, marginY);
-    extendPicCompBorder(dstB, dstStride, block_width, block_height, marginX, marginY);
-    extendPicCompBorder(dstC, dstStride, block_width, block_height, marginX, marginY);
-}
-
-// filterVertical, Multiplane, Weighted
-void filterVerticalWeighted(short *src, intptr_t srcStride, pixel *dstE, pixel *dstI, pixel *dstP,
-                            intptr_t dstStride, int block_width, int block_height, int marginX, int marginY,
-                            int scale, int round, int shift, int offset)
-{
-    short* intI, *intE, *intP;    
-    int intStride = block_width;
-
-    intI = (short*)X265_MALLOC(short, block_height * block_width);
-    intE = (short*)X265_MALLOC(short, block_height * block_width);
-    intP = (short*)X265_MALLOC(short, block_height * block_width);
-
-    filterVertical_s_s<8>(src, srcStride, intI, intStride, block_width, block_height, g_lumaFilter[2]);
-    filterVertical_s_s<8>(src, srcStride, intE, intStride, block_width, block_height, g_lumaFilter[1]);
-    filterVertical_s_s<8>(src, srcStride, intP, intStride, block_width, block_height, g_lumaFilter[3]);
-
-    weightUnidir(intI, dstI, intStride, dstStride,block_width, block_height, scale, round, shift, offset);
-    weightUnidir(intE, dstE, intStride, dstStride,block_width, block_height, scale, round, shift, offset);
-    weightUnidir(intP, dstP, intStride, dstStride,block_width, block_height, scale, round, shift, offset);
-
-    extendPicCompBorder(dstE, dstStride, block_width, block_height, marginX, marginY);
-    extendPicCompBorder(dstI, dstStride, block_width, block_height, marginX, marginY);
-    extendPicCompBorder(dstP, dstStride, block_width, block_height, marginX, marginY);
-
-    X265_FREE(intI);
-    X265_FREE(intE);
-    X265_FREE(intP);
-}
 }
 
 void filterRowH(pixel *src, intptr_t srcStride, short* midA, short* midB, short* midC, intptr_t midStride, pixel *dstA, pixel *dstB, pixel *dstC, int width, int height, int marginX, int marginY, int row, int isLastRow)
@@ -754,9 +704,6 @@
     p.filterRowV_0 = filterRowV_0;
     p.filterRowV_N = filterRowV_N;
 
-    p.filterVwghtd = filterVerticalWeighted;         
-    p.filterHwghtd = filterHorizontalWeighted;
-    
     p.extendRowBorder = extendCURowColBorder;
 }
 }
diff -r c010342f7605 -r 15c6e2451b34 source/common/primitives.h
--- a/source/common/primitives.h	Sun Oct 06 02:09:00 2013 -0500
+++ b/source/common/primitives.h	Sun Oct 06 22:30:10 2013 -0500
@@ -219,11 +219,6 @@
 typedef void (*dequant_t)(const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList,
                           unsigned int trSizeLog2, int *dequantCoef);
 
-typedef void (*filterVwghtd_t)(short *src, intptr_t srcStride, pixel *dstE, pixel *dstI, pixel *dstP, intptr_t dstStride, int block_width,
-                               int block_height, int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
-typedef void (*filterHwghtd_t)(pixel *src, intptr_t srcStride, short *midF, short* midA, short* midB, short* midC, intptr_t midStride,
-                               pixel *dstF, pixel *dstA, pixel *dstB, pixel *dstC, intptr_t dstStride, int block_width, int block_height,
-                               int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
 typedef void (*filterRowH_t)(pixel *src, intptr_t srcStride, short* midA, short* midB, short* midC, intptr_t midStride, pixel *dstA, pixel *dstB, pixel *dstC, int width, int height, int marginX, int marginY, int row, int isLastRow);
 typedef void (*filterRowV_0_t)(pixel *src, intptr_t srcStride, pixel *dstA, pixel *dstB, pixel *dstC, int width, int height, int marginX, int marginY, int row, int isLastRow);
 typedef void (*filterRowV_N_t)(short *midA, intptr_t midStride, pixel *dstA, pixel *dstB, pixel *dstC, intptr_t dstStride, int width, int height, int marginX, int marginY, int row, int isLastRow);
@@ -273,7 +268,6 @@
     filterRowV_N_t  filterRowV_N;
     extendCURowBorder_t extendRowBorder;
 
-
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
     intra_ang_t     intra_pred_ang;
@@ -295,9 +289,6 @@
     pixeladd_pp_t   pixeladd_pp;
     pixelavg_pp_t   pixelavg_pp[NUM_PARTITIONS];
 
-    filterVwghtd_t  filterVwghtd;
-    filterHwghtd_t  filterHwghtd;
-
     scale_t         scale1D_128to64;
     scale_t         scale2D_64to32;
     downscale_t     frame_init_lowres_core;
diff -r c010342f7605 -r 15c6e2451b34 source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp	Sun Oct 06 02:09:00 2013 -0500
+++ b/source/common/vec/ipfilter-sse41.cpp	Sun Oct 06 22:30:10 2013 -0500
@@ -1652,716 +1652,6 @@
         dst += dstStride;
     }
 }
-
-ALIGN_VAR_32(const uint8_t, ipfilterH_0[][16]) =
-{
-    {0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8},
-    {2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10},
-    {4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12},
-    {6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14},
-};
-
-ALIGN_VAR_32(const int8_t, ipfilterH_1[][16]) =
-{
-    {-1, 4, -10, 58, 17,  -5, 1,  0, -1, 4, -10, 58, 17,  -5, 1,  0},
-    {-1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1},
-    { 0, 1,  -5, 17, 58, -10, 4, -1,  0, 1,  -5, 17, 58, -10, 4, -1},
-};
-
-void filterHorizontalMultiplaneExtend(pixel *src, intptr_t srcStride,
-                                      short *intF, short* intA, short* intB, short* intC, intptr_t intStride,
-                                      pixel *dstA, pixel *dstB, pixel *dstC, intptr_t dstStride,
-                                      int block_width, int block_height,
-                                      int marginX, int marginY)
-{
-    int row, col;
-    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
-    int shift = IF_FILTER_PREC - headRoom;
-    int offset = -IF_INTERNAL_OFFS << shift;
-
-    src -= (8 / 2 - 1);
-    __m128i vec_src0;
-    __m128i vec_offset = _mm_set1_epi16(offset);
-    __m128i tmp;
-    __m128i tmp16a, tmp16b, tmp16c;
-
-    // Load Ai, ai += Ai*coefi
-    for (row = 0; row < block_height; row++)
-    {
-        col = 0;
-        __m128i ma, mb, mc;
-
-        const __m128i c_off = _mm_set1_epi16(IF_INTERNAL_OFFS);
-        const __m128i c_32 = _mm_set1_epi16(32);
-        __m128i T00;
-        __m128i T10, T11, T12, T13;
-        __m128i T20, T21, T22, T23;
-
-        T00 = _mm_loadu_si128((__m128i*)(src + col + 3));
-        T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
-        T00 = _mm_slli_epi16(T00, 6);
-        _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(T00, c_off));
-
-        T00 = _mm_loadu_si128((__m128i*)(src + col));
-
-        T10 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[0]));
-        T11 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[1]));
-        T12 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[2]));
-        T13 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[3]));
-
-        T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-        T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-        T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-        T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-        T20 = _mm_hadd_epi16(T20, T21);
-        T21 = _mm_hadd_epi16(T22, T23);
-        T20 = _mm_hadd_epi16(T20, T21);
-        _mm_storeu_si128((__m128i*)(intA + col), _mm_sub_epi16(T20, c_off));
-        T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-        T20 = _mm_packus_epi16(T20, T20);
-        _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), T20);
-
-        T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-        T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-        T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-        T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-        T20 = _mm_hadd_epi16(T20, T21);
-        T21 = _mm_hadd_epi16(T22, T23);
-        T20 = _mm_hadd_epi16(T20, T21);
-        _mm_storeu_si128((__m128i*)(intB + col), _mm_sub_epi16(T20, c_off));
-        T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-        T20 = _mm_packus_epi16(T20, T20);
-        _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), T20);
-
-        T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-        T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-        T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-        T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-        T20 = _mm_hadd_epi16(T20, T21);
-        T21 = _mm_hadd_epi16(T22, T23);
-        T20 = _mm_hadd_epi16(T20, T21);
-        _mm_storeu_si128((__m128i*)(intC + col), _mm_sub_epi16(T20, c_off));
-        T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-        T20 = _mm_packus_epi16(T20, T20);
-        _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), T20);
-
-        // Extend First column
-        ma = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstA[row * dstStride]), _mm_setzero_si128());
-        mb = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstB[row * dstStride]), _mm_setzero_si128());
-        mc = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstC[row * dstStride]), _mm_setzero_si128());
-        for (int i = -marginX; i < -16; i += 16)
-        {
-            _mm_storeu_si128((__m128i*)(dstA + row * dstStride +  i), ma);
-            _mm_storeu_si128((__m128i*)(dstB + row * dstStride +  i), mb);
-            _mm_storeu_si128((__m128i*)(dstC + row * dstStride +  i), mc);
-        }
-
-        _mm_storeu_si128((__m128i*)(dstA + row * dstStride - 16), ma); /*Assuming marginX > 16*/
-        _mm_storeu_si128((__m128i*)(dstB + row * dstStride - 16), mb);
-        _mm_storeu_si128((__m128i*)(dstC + row * dstStride - 16), mc);
-
-        col += 8;
-
-        for (; col + 8 /*16*/ <= (block_width); col += 8 /*16*/)               // Iterations multiple of 8
-        {
-            T00 = _mm_loadu_si128((__m128i*)(src + col + 3));
-            T00 = _mm_unpacklo_epi8(T00, _mm_setzero_si128());
-            T00 = _mm_slli_epi16(T00, 6);
-            _mm_storeu_si128((__m128i*)(intF + col), _mm_sub_epi16(T00, c_off));
-
-            T00 = _mm_loadu_si128((__m128i*)(src + col));
-
-            T10 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[0]));
-            T11 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[1]));
-            T12 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[2]));
-            T13 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)ipfilterH_0[3]));
-
-            T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-            T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-            T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-            T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[0]));
-            T20 = _mm_hadd_epi16(T20, T21);
-            T21 = _mm_hadd_epi16(T22, T23);
-            T20 = _mm_hadd_epi16(T20, T21);
-            _mm_storeu_si128((__m128i*)(intA + col), _mm_sub_epi16(T20, c_off));
-            T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-            T20 = _mm_packus_epi16(T20, T20);
-            _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), T20);
-            tmp16a = T20;
-
-            T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-            T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-            T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-            T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[1]));
-            T20 = _mm_hadd_epi16(T20, T21);
-            T21 = _mm_hadd_epi16(T22, T23);
-            T20 = _mm_hadd_epi16(T20, T21);
-            _mm_storeu_si128((__m128i*)(intB + col), _mm_sub_epi16(T20, c_off));
-            T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-            T20 = _mm_packus_epi16(T20, T20);
-            _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), T20);
-            tmp16b = T20;
-
-            T20 = _mm_maddubs_epi16(T10, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-            T21 = _mm_maddubs_epi16(T11, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-            T22 = _mm_maddubs_epi16(T12, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-            T23 = _mm_maddubs_epi16(T13, _mm_load_si128((__m128i*)ipfilterH_1[2]));
-            T20 = _mm_hadd_epi16(T20, T21);
-            T21 = _mm_hadd_epi16(T22, T23);
-            T20 = _mm_hadd_epi16(T20, T21);
-            _mm_storeu_si128((__m128i*)(intC + col), _mm_sub_epi16(T20, c_off));
-            T20 = _mm_srai_epi16(_mm_add_epi16(T20, c_32), 6);
-            T20 = _mm_packus_epi16(T20, T20);
-            _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), T20);
-            tmp16c = T20;
-        }
-
-        // TODO: I think we may change algorithm to always alignment, so this code will be remove later
-        if (block_width - col > 0)
-        {
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            _mm_storeu_si128((__m128i*)(intF + block_width - 8), _mm_sub_epi16(_mm_sll_epi16(tmp, _mm_cvtsi32_si128(6)), _mm_set1_epi16(IF_INTERNAL_OFFS)));
-            __m128i a, b, c, sum1, sum2, sum3 = _mm_setzero_si128();
-            for (; col < block_width; col++)                           // Remaining iterations
-            {
-                vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
-                tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());    // Assuming that there is no overflow (Everywhere in this function!)
-                a = _mm_setr_epi16(-1, 4, -10, 58, 17,  -5, 1,  0);
-                a = _mm_mullo_epi16(tmp, a);
-                b = _mm_setr_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
-                b = _mm_mullo_epi16(tmp, b);
-                c = _mm_setr_epi16(0, 1,  -5, 17, 58, -10, 4, -1);
-                c = _mm_mullo_epi16(tmp, c);
-                sum1  = _mm_hadd_epi16(a, b);                   // horizontally add 8 elements in 3 steps
-                sum2  = _mm_hadd_epi16(c, c);
-                sum2  = _mm_hadd_epi16(sum1, sum2);
-                sum3  = _mm_hadd_epi16(sum2, sum2);
-                sum3  = _mm_add_epi16(sum3, vec_offset);
-                sum3  = _mm_sra_epi16(sum3, _mm_cvtsi32_si128(shift));
-                intA[col] = _mm_cvtsi128_si32(sum3);
-                intB[col] = _mm_extract_epi16(sum3, 1);
-                intC[col] = _mm_extract_epi16(sum3, 2);
-                sum3 = _mm_add_epi16(sum3, _mm_set1_epi16(IF_INTERNAL_OFFS + 32));
-                sum3 = _mm_sra_epi16(sum3, _mm_cvtsi32_si128(6));
-                sum3 = _mm_packus_epi16(sum3, sum3);
-                dstA[row * dstStride + col] = _mm_extract_epi8(sum3, 0);
-                dstB[row * dstStride + col] = _mm_extract_epi8(sum3, 1);
-                dstC[row * dstStride + col] = _mm_extract_epi8(sum3, 2);
-            }
-        }
-
-        tmp16a = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstA[row * dstStride + block_width - 1]), _mm_setzero_si128());
-        tmp16b = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstB[row * dstStride + block_width - 1]), _mm_setzero_si128());
-        tmp16c = _mm_shuffle_epi8(_mm_cvtsi32_si128(dstC[row * dstStride + block_width - 1]), _mm_setzero_si128());
-
-        // Extend last column
-        for (int i = -marginX; i < -16; i += 16)
-        {
-            _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX + i), tmp16a);
-            _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX + i), tmp16b);
-            _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX + i), tmp16c);
-        }
-
-        _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX - 16), tmp16a); /*Assuming marginX > 16*/
-        _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX - 16), tmp16b);
-        _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX - 16), tmp16c);
-
-        src += srcStride;
-        intF += intStride;
-        intA += intStride;
-        intB += intStride;
-        intC += intStride;
-    }
-
-    // Extending bottom rows
-    pixel *pe, *pi, *pp;
-    pe = dstA + (block_height - 1) * dstStride - marginX;
-    pi = dstB + (block_height - 1) * dstStride - marginX;
-    pp = dstC + (block_height - 1) * dstStride - marginX;
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pe + y * dstStride, pe, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pi + y * dstStride, pi, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pp + y * dstStride, pp, block_width + marginX * 2);
-    }
-
-    // Extending top rows
-    pe  = dstA - marginX;
-    pi  = dstB - marginX;
-    pp  = dstC - marginX;
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pe - y * dstStride, pe, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pi - y * dstStride, pi, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pp - y * dstStride, pp, block_width + marginX * 2);
-    }
-}
-
-void filterHorizontalWeighted(pixel *src, intptr_t srcStride,
-                              short *intF, short* intA, short* intB, short* intC, intptr_t intStride,
-                              pixel * dstF, pixel *dstA, pixel *dstB, pixel *dstC, intptr_t dstStride,
-                              int block_width, int block_height,
-                              int marginX, int marginY, int scale, int wround, int wshift, int woffset)
-{
-    int row, col;
-    int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
-    int shift = IF_FILTER_PREC - headRoom;
-    int offset = -IF_INTERNAL_OFFS << shift;
-
-    src -= (8 / 2 - 1);
-    __m128i vec_src0;
-    __m128i vec_offset = _mm_set1_epi16(offset);
-    __m128i sumaL, sumbL, sumcL, tmp, exp1;
-    __m128i tmp16a, tmp16b, tmp16c, tmp16f, tmpwlo, tmpwhi;
-
-    int shiftNum = IF_INTERNAL_PREC - X265_DEPTH;
-    wshift = wshift + shiftNum;
-    wround = wshift ? (1 << (wshift - 1)) : 0;
-
-    __m128i vround = _mm_set1_epi32(wround + scale * IF_INTERNAL_OFFS);
-    __m128i ofs = _mm_set1_epi32(woffset);
-    __m128i vscale = _mm_set1_epi32(scale);
-
-    // Load Ai, ai += Ai*coefi
-    for (row = 0; row < block_height; row++)
-    {
-        col = 0;
-
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
-        sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
-        sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
-
-        // a = b+=4*a1,  c+=1*a1
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
-        sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        sumbL = _mm_add_epi16(sumbL, _mm_sll_epi16(sumcL, _mm_cvtsi32_si128(2)));
-        sumaL = sumbL;
-
-        // a +=-10*a2    b+=-11*a2      c+=-5*a2
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        sumbL = _mm_sub_epi16(sumbL, tmp);
-        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
-        sumcL = _mm_add_epi16(sumcL, tmp);
-        tmp = _mm_slli_epi16(tmp, 1);
-        sumaL = _mm_add_epi16(sumaL, tmp);
-        sumbL = _mm_add_epi16(sumbL, tmp);
-
-        // a +=58*a3    b+=40*a3      c+=17*a3
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        tmp16f = _mm_sub_epi16(_mm_slli_epi16(tmp, 6), _mm_set1_epi16(IF_INTERNAL_OFFS));
-        _mm_storeu_si128((__m128i*)(intF + col), tmp16f);
-        //Apply weight on Full pel
-        tmpwlo = _mm_unpacklo_epi16(tmp16f, _mm_srai_epi16(tmp16f, 15));
-        tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-        tmpwlo = _mm_add_epi32(tmpwlo, vround);
-        tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-        tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-        tmpwhi = _mm_unpackhi_epi16(tmp16f, _mm_srai_epi16(tmp16f, 15));
-        tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-        tmpwhi = _mm_add_epi32(tmpwhi, vround);
-        tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-        tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-        tmp16f = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-        _mm_storel_epi64((__m128i*)(dstF + row * dstStride + col), tmp16f);
-
-        exp1 = _mm_add_epi16(tmp, _mm_slli_epi16(tmp, 4));
-        sumcL = _mm_add_epi16(sumcL, exp1);
-        sumaL = _mm_add_epi16(sumaL, tmp);
-        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
-        sumbL = _mm_add_epi16(sumbL, tmp);
-        sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
-
-        // a +=17*a4    b+=40*a4      c+=58*a4
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        exp1 = _mm_add_epi16(tmp, _mm_slli_epi16(tmp, 4));
-        sumaL = _mm_add_epi16(sumaL, exp1);
-        sumcL = _mm_add_epi16(sumcL, tmp);
-        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
-        sumbL = _mm_add_epi16(sumbL, tmp);
-        sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
-
-        // a +=-5*a5    b+=-11*a5      c+=-10*a5
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        sumbL = _mm_sub_epi16(sumbL, tmp);
-        tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
-        sumaL = _mm_add_epi16(sumaL, tmp);
-        tmp = _mm_slli_epi16(tmp, 1);
-        sumcL = _mm_add_epi16(sumcL, tmp);
-        sumbL = _mm_add_epi16(sumbL, tmp);
-
-        // a +=1*a6    b+=4*a6      c+=4*a6
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        sumaL = _mm_add_epi16(sumaL, tmp);
-        tmp = _mm_slli_epi16(tmp, 2);
-        sumbL = _mm_add_epi16(sumbL, tmp);
-        sumcL = _mm_add_epi16(sumcL, tmp);
-
-        // a +=0*a7    b+=-1*a7      c+=-1*a7
-        vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
-        tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-        sumbL = _mm_sub_epi16(sumbL, tmp);
-        sumcL = _mm_sub_epi16(sumcL, tmp);
-        sumaL = _mm_add_epi16(sumaL, vec_offset);
-        sumbL = _mm_add_epi16(sumbL, vec_offset);
-        sumcL = _mm_add_epi16(sumcL, vec_offset);
-
-        _mm_storeu_si128((__m128i*)(intA + col), sumaL);
-        //Apply weight
-        tmpwlo = _mm_unpacklo_epi16(sumaL, _mm_srai_epi16(sumaL, 15));
-        tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-        tmpwlo = _mm_add_epi32(tmpwlo, vround);
-        tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-        tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-        tmpwhi = _mm_unpackhi_epi16(sumaL, _mm_srai_epi16(sumaL, 15));
-        tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-        tmpwhi = _mm_add_epi32(tmpwhi, vround);
-        tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-        tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-        tmp16a = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-        _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
-
-        _mm_storeu_si128((__m128i*)(intB + col), sumbL);
-        //Apply weight
-        tmpwlo = _mm_unpacklo_epi16(sumbL, _mm_srai_epi16(sumbL, 15));
-        tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-        tmpwlo = _mm_add_epi32(tmpwlo, vround);
-        tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-        tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-        tmpwhi = _mm_unpackhi_epi16(sumbL, _mm_srai_epi16(sumbL, 15));
-        tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-        tmpwhi = _mm_add_epi32(tmpwhi, vround);
-        tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-        tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-        tmp16b = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-        _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
-
-        _mm_storeu_si128((__m128i*)(intC + col), sumcL);
-        //Apply weight
-        tmpwlo = _mm_unpacklo_epi16(sumcL, _mm_srai_epi16(sumcL, 15));
-        tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-        tmpwlo = _mm_add_epi32(tmpwlo, vround);
-        tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-        tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-        tmpwhi = _mm_unpackhi_epi16(sumcL, _mm_srai_epi16(sumcL, 15));
-        tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-        tmpwhi = _mm_add_epi32(tmpwhi, vround);
-        tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-        tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-        tmp16c = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-        _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
-
-        // Extend First column
-        __m128i ma, mb, mc, mf;
-        mf = _mm_shuffle_epi8(tmp16f, _mm_set1_epi8(0));
-        ma = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(0));
-        mb = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(0));
-        mc = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(0));
-
-        for (int i = -marginX; i < -16; i += 16)
-        {
-            _mm_storeu_si128((__m128i*)(dstF + row * dstStride +  i), mf);
-            _mm_storeu_si128((__m128i*)(dstA + row * dstStride +  i), ma);
-            _mm_storeu_si128((__m128i*)(dstB + row * dstStride +  i), mb);
-            _mm_storeu_si128((__m128i*)(dstC + row * dstStride +  i), mc);
-        }
-
-        _mm_storeu_si128((__m128i*)(dstF + row * dstStride - 16), mf); /*Assuming marginX > 16*/
-        _mm_storeu_si128((__m128i*)(dstA + row * dstStride - 16), ma);
-        _mm_storeu_si128((__m128i*)(dstB + row * dstStride - 16), mb);
-        _mm_storeu_si128((__m128i*)(dstC + row * dstStride - 16), mc);
-
-        col += 8;
-
-        for (; col + 8 /*16*/ <= (block_width); col += 8 /*16*/)               // Iterations multiple of 8
-        {
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
-            sumbL = (_mm_unpacklo_epi8(vec_src0, _mm_setzero_si128()));
-            sumbL = _mm_sub_epi16(_mm_setzero_si128(), sumbL);
-
-            // a = b+=4*a1,  c+=1*a1
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 1));
-            sumcL = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            sumbL = _mm_add_epi16(sumbL, _mm_slli_epi16(sumcL, 2));
-            sumaL = sumbL;
-
-            // a +=-10*a2    b+=-11*a2      c+=-5*a2
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 2));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            sumbL = _mm_sub_epi16(sumbL, tmp);
-            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
-            sumcL = _mm_add_epi16(sumcL, tmp);
-            tmp = _mm_slli_epi16(tmp, 1);
-            sumaL = _mm_add_epi16(sumaL, tmp);
-            sumbL = _mm_add_epi16(sumbL, tmp);
-
-            // a +=58*a3    b+=40*a3      c+=17*a3
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 3));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            tmp16f = _mm_sub_epi16(_mm_slli_epi16(tmp, 6), _mm_set1_epi16(IF_INTERNAL_OFFS));
-            _mm_storeu_si128((__m128i*)(intF + col), tmp16f);
-            //Apply weight
-            tmpwlo = _mm_unpacklo_epi16(tmp16f, _mm_srai_epi16(tmp16f, 15));
-            tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-            tmpwlo = _mm_add_epi32(tmpwlo, vround);
-            tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-            tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-            tmpwhi = _mm_unpackhi_epi16(tmp16f, _mm_srai_epi16(tmp16f, 15));
-            tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-            tmpwhi = _mm_add_epi32(tmpwhi, vround);
-            tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-            tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-            tmp16f = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-            _mm_storel_epi64((__m128i*)(dstF + row * dstStride + col), tmp16f);
-
-            exp1 = _mm_add_epi16(tmp, _mm_slli_epi16(tmp, 4));
-            sumcL = _mm_add_epi16(sumcL, exp1);
-            sumaL = _mm_add_epi16(sumaL, tmp);
-            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
-            sumbL = _mm_add_epi16(sumbL, tmp);
-            sumaL = _mm_add_epi16(sumaL, _mm_add_epi16(exp1, tmp));
-
-            // a +=17*a4    b+=40*a4      c+=58*a4
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 4));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            exp1 = _mm_add_epi16(tmp, _mm_slli_epi16(tmp, 4));
-            sumaL = _mm_add_epi16(sumaL, exp1);
-            sumcL = _mm_add_epi16(sumcL, tmp);
-            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(40));
-            sumbL = _mm_add_epi16(sumbL, tmp);
-            sumcL = _mm_add_epi16(sumcL, _mm_add_epi16(exp1, tmp));
-
-            // a +=-5*a5    b+=-11*a5      c+=-10*a5
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 5));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            sumbL = _mm_sub_epi16(sumbL, tmp);
-            tmp = _mm_mullo_epi16(tmp, _mm_set1_epi16(-5));
-            sumaL = _mm_add_epi16(sumaL, tmp);
-            tmp = _mm_slli_epi16(tmp, 1);
-            sumcL = _mm_add_epi16(sumcL, tmp);
-            sumbL = _mm_add_epi16(sumbL, tmp);
-
-            // a +=1*a6    b+=4*a6      c+=4*a6
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 6));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            sumaL = _mm_add_epi16(sumaL, tmp);
-            tmp = _mm_slli_epi16(tmp, 2);
-            sumbL = _mm_add_epi16(sumbL, tmp);
-            sumcL = _mm_add_epi16(sumcL, tmp);
-
-            // a +=0*a7    b+=-1*a7      c+=-1*a7
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + col + 7));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            sumbL = _mm_sub_epi16(sumbL, tmp);
-            sumcL = _mm_sub_epi16(sumcL, tmp);
-            sumaL = _mm_add_epi16(sumaL, vec_offset);
-            sumbL = _mm_add_epi16(sumbL, vec_offset);
-            sumcL = _mm_add_epi16(sumcL, vec_offset);
-
-            _mm_storeu_si128((__m128i*)(intA + col), sumaL);
-            //Apply weight
-            tmpwlo = _mm_unpacklo_epi16(sumaL, _mm_srai_epi16(sumaL, 15));
-            tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-            tmpwlo = _mm_add_epi32(tmpwlo, vround);
-            tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-            tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-            tmpwhi = _mm_unpackhi_epi16(sumaL, _mm_srai_epi16(sumaL, 15));
-            tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-            tmpwhi = _mm_add_epi32(tmpwhi, vround);
-            tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-            tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-            tmp16a = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-            _mm_storel_epi64((__m128i*)(dstA + row * dstStride + col), tmp16a);
-
-            _mm_storeu_si128((__m128i*)(intB + col), sumbL);
-            //Apply weight
-            tmpwlo = _mm_unpacklo_epi16(sumbL, _mm_srai_epi16(sumbL, 15));
-            tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-            tmpwlo = _mm_add_epi32(tmpwlo, vround);
-            tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-            tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-            tmpwhi = _mm_unpackhi_epi16(sumbL, _mm_srai_epi16(sumbL, 15));
-            tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-            tmpwhi = _mm_add_epi32(tmpwhi, vround);
-            tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-            tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-            tmp16b = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-            _mm_storel_epi64((__m128i*)(dstB + row * dstStride + col), tmp16b);
-
-            _mm_storeu_si128((__m128i*)(intC + col), sumcL);
-            //Apply weight
-            tmpwlo = _mm_unpacklo_epi16(sumcL, _mm_srai_epi16(sumcL, 15));
-            tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-            tmpwlo = _mm_add_epi32(tmpwlo, vround);
-            tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-            tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-            tmpwhi = _mm_unpackhi_epi16(sumcL, _mm_srai_epi16(sumcL, 15));
-            tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-            tmpwhi = _mm_add_epi32(tmpwhi, vround);
-            tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-            tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-            tmp16c = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-            _mm_storel_epi64((__m128i*)(dstC + row * dstStride + col), tmp16c);
-        }
-
-        if (block_width - col > 0)
-        {
-            vec_src0 = _mm_loadu_si128((__m128i const*)(src + block_width - 5));
-            tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());
-            tmp = _mm_sub_epi16(_mm_slli_epi16(tmp, 6), _mm_set1_epi16(IF_INTERNAL_OFFS));
-            _mm_storeu_si128((__m128i*)(intF + block_width - 8), tmp);
-            //Apply weight
-            tmpwlo = _mm_unpacklo_epi16(tmp, _mm_srai_epi16(tmp, 15));
-            tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-            tmpwlo = _mm_add_epi32(tmpwlo, vround);
-            tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-            tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-            tmpwhi = _mm_unpackhi_epi16(tmp, _mm_srai_epi16(tmp, 15));
-            tmpwhi = _mm_mullo_epi32(tmpwhi, vscale);
-            tmpwhi = _mm_add_epi32(tmpwhi, vround);
-            tmpwhi = _mm_srai_epi32(tmpwhi, wshift);
-            tmpwhi = _mm_add_epi32(tmpwhi, ofs);
-            tmp = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwhi), _mm_setzero_si128());
-            _mm_storel_epi64((__m128i*)(dstF + row * dstStride + block_width - 8), tmp);
-            tmp16f = _mm_shuffle_epi8(tmp, _mm_set1_epi8(7));
-
-            __m128i a, b, c, sum1, sum2, sum3 = _mm_setzero_si128();
-            for (; col < block_width; col++)                           // Remaining iterations
-            {
-                vec_src0 = _mm_loadu_si128((__m128i const*)(src + col));
-                tmp = _mm_unpacklo_epi8(vec_src0, _mm_setzero_si128());    // Assuming that there is no overflow (Everywhere in this function!)
-                a = _mm_setr_epi16(-1, 4, -10, 58, 17,  -5, 1,  0);
-                a = _mm_mullo_epi16(tmp, a);
-                b = _mm_setr_epi16(-1, 4, -11, 40, 40, -11, 4, -1);
-                b = _mm_mullo_epi16(tmp, b);
-                c = _mm_setr_epi16(0, 1,  -5, 17, 58, -10, 4, -1);
-                c = _mm_mullo_epi16(tmp, c);
-                sum1  = _mm_hadd_epi16(a, b);                   // horizontally add 8 elements in 3 steps
-                sum2  = _mm_hadd_epi16(c, c);
-                sum2  = _mm_hadd_epi16(sum1, sum2);
-                sum3  = _mm_hadd_epi16(sum2, sum2);
-                sum3  = _mm_add_epi16(sum3, vec_offset);
-                sum3  = _mm_sra_epi16(sum3, _mm_cvtsi32_si128(shift));
-                intA[col] = _mm_cvtsi128_si32(sum3);
-                intB[col] = _mm_extract_epi16(sum3, 1);
-                intC[col] = _mm_extract_epi16(sum3, 2);
-
-                tmpwlo = _mm_unpacklo_epi16(sum3, _mm_srai_epi16(sum3, 15));
-                tmpwlo = _mm_mullo_epi32(tmpwlo, vscale);
-                tmpwlo = _mm_add_epi32(tmpwlo, vround);
-                tmpwlo = _mm_srai_epi32(tmpwlo, wshift);
-                tmpwlo = _mm_add_epi32(tmpwlo, ofs);
-                sum3 = _mm_packus_epi16(_mm_packs_epi32(tmpwlo, tmpwlo), _mm_setzero_si128());
-
-                dstA[row * dstStride + col] = _mm_extract_epi8(sum3, 0);
-                dstB[row * dstStride + col] = _mm_extract_epi8(sum3, 1);
-                dstC[row * dstStride + col] = _mm_extract_epi8(sum3, 2);
-            }
-
-            tmp16a = _mm_shuffle_epi8(sum3, _mm_set1_epi8(0));
-            tmp16b = _mm_shuffle_epi8(sum3, _mm_set1_epi8(1));
-            tmp16c = _mm_shuffle_epi8(sum3, _mm_set1_epi8(2));
-        }
-        else
-        {
-            tmp16f = _mm_shuffle_epi8(tmp16f, _mm_set1_epi8(7));
-            tmp16a = _mm_shuffle_epi8(tmp16a, _mm_set1_epi8(7));
-            tmp16b = _mm_shuffle_epi8(tmp16b, _mm_set1_epi8(7));
-            tmp16c = _mm_shuffle_epi8(tmp16c, _mm_set1_epi8(7));
-        }
-        // Extend last column
-        for (int i = -marginX; i < -16; i += 16)
-        {
-            _mm_storeu_si128((__m128i*)(dstF + row * dstStride + block_width + marginX + i), tmp16f);
-            _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX + i), tmp16a);
-            _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX + i), tmp16b);
-            _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX + i), tmp16c);
-        }
-
-        _mm_storeu_si128((__m128i*)(dstF + row * dstStride + block_width + marginX - 16), tmp16f); /*Assuming marginX > 16*/
-        _mm_storeu_si128((__m128i*)(dstA + row * dstStride + block_width + marginX - 16), tmp16a);
-        _mm_storeu_si128((__m128i*)(dstB + row * dstStride + block_width + marginX - 16), tmp16b);
-        _mm_storeu_si128((__m128i*)(dstC + row * dstStride + block_width + marginX - 16), tmp16c);
-
-        src += srcStride;
-        intF += intStride;
-        intA += intStride;
-        intB += intStride;
-        intC += intStride;
-    }
-
-    // Extending bottom rows
-    pixel *pe, *pi, *pp, *pf;
-    pf = dstF + (block_height - 1) * dstStride - marginX;
-    pe = dstA + (block_height - 1) * dstStride - marginX;
-    pi = dstB + (block_height - 1) * dstStride - marginX;
-    pp = dstC + (block_height - 1) * dstStride - marginX;
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pf + y * dstStride, pf, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pe + y * dstStride, pe, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pi + y * dstStride, pi, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pp + y * dstStride, pp, block_width + marginX * 2);
-    }
-
-    // Extending top rows
-    pf  = dstF - marginX;
-    pe  = dstA - marginX;
-    pi  = dstB - marginX;
-    pp  = dstC - marginX;
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pf - y * dstStride, pf, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pe - y * dstStride, pe, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pi - y * dstStride, pi, block_width + marginX * 2);
-    }
-
-    for (int y = 1; y <= marginY; y++)
-    {
-        memcpy(pp - y * dstStride, pp, block_width + marginX * 2);
-    }
-}
 #endif
 }
 
@@ -2376,12 +1666,5 @@
 
     p.ipfilter_sp[FILTER_V_S_P_4] = filterVertical_s_p<4>;
     p.ipfilter_sp[FILTER_V_S_P_8] = filterVertical_s_p<8>;
-
-#if !HIGH_BIT_DEPTH
-    p.filterVwghtd = filterVerticalWeighted;
-#if !(defined(_MSC_VER) && _MSC_VER == 1500 && X86_64)
-    p.filterHwghtd = filterHorizontalWeighted;
-#endif
-#endif
 }
 }


