[x265] [PATCH 1 of 4 x265] Scale pictures using bicubic or bilinear algorithm

Tue Mar 31 07:55:11 CEST 2020

# HG changeset patch
# User Pooja Venkatesan <pooja at multicorewareinc.com>
# Date 1566813889 -19800
#      Mon Aug 26 15:34:49 2019 +0530
# Node ID 96739081845ad6186aee54c61f69de1f30abff33
# Parent  00b686782ad0c60dff48a1a001417814ce9a20dd
Scale pictures using bicubic or bilinear algorithm.

diff -r 00b686782ad0 -r 96739081845a source/common/CMakeLists.txt

--- a/source/common/CMakeLists.txt	Thu Feb 13 11:57:33 2020 +0530
+++ b/source/common/CMakeLists.txt	Mon Aug 26 15:34:49 2019 +0530
@@ -168,4 +168,5 @@
     predict.cpp  predict.h
     scalinglist.cpp scalinglist.h
     quant.cpp quant.h contexts.h
-    deblock.cpp deblock.h)
+    deblock.cpp deblock.h
+    scaler.cpp scaler.h)
diff -r 00b686782ad0 -r 96739081845a source/common/scaler.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.cpp	Mon Aug 26 15:34:49 2019 +0530
@@ -0,0 +1,1087 @@
+#include "scaler.h"
+
+#if _MSC_VER
+#pragma warning(disable: 4706) // assignment within conditional
+#pragma warning(disable: 4244) // '=' : possible loss of data
+#endif
+
+// Bounds of a signed 16-bit value; used below to clamp the horizontal filter output.
+#define SHORT_MIN (-(1 << 15))
+#define SHORT_MAX ((1 << 15) - 1)
+// Largest unsigned 10-bit value (1023); upper clamp for samples written by yuv2PlaneX_c_h.
+#define SHORT_MAX_10 ((1 << 10) - 1)
+
+namespace X265_NS{
+
+/* Build an empty manager: bit depth, algorithm flags, every picture dimension and
+ * every chroma sub-sampling shift start at zero. */
+ScalerFilterManager::ScalerFilterManager()
+    : m_bitDepth(0), m_algorithmFlags(0),
+      m_srcW(0), m_srcH(0), m_dstW(0), m_dstH(0),
+      m_crSrcW(0), m_crSrcH(0), m_crDstW(0), m_crDstH(0),
+      m_crSrcHSubSample(0), m_crSrcVSubSample(0),
+      m_crDstHSubSample(0), m_crDstVSubSample(0)
+{
+    // Clear both pointer tables; init() and initScalerSlice() fill them in later.
+    int idx = 0;
+    while (idx < m_numSlice)
+        m_slices[idx++] = NULL;
+
+    idx = 0;
+    while (idx < m_numFilter)
+        m_ScalerFilters[idx++] = NULL;
+}
+
+// Copy the first `size` coefficients from `filter` into `filter2`, one element at a
+// time. Nothing is copied when `size` is zero or negative.
+inline static void filter_copy_c(int64_t* filter, int64_t* filter2, int size)
+{
+    const int64_t* from = filter;
+    int64_t* to = filter2;
+    for (int remaining = size; remaining > 0; --remaining)
+        *to++ = *from++;
+}
+
+#if X265_DEPTH == 8
+// C reference for the horizontal pass on 8-bit input. For every output sample x it
+// accumulates filterSize source pixels starting at filterPos[x], each weighted by the
+// matching coefficient of row x of `filter`, then drops 7 bits and clamps the result
+// to the signed 16-bit range.
+static void doScaling_c(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const int16_t *taps = filter; // coefficient row of the current output sample
+    for (int x = 0; x < dstW; x++, taps += filterSize)
+    {
+        const int start = filterPos[x];
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)src[start + t]) * taps[t];
+        // the cubic equation does overflow ...
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 7);
+    }
+}
+// Clamp an int to the 0..255 range of an 8-bit sample.
+// Plain comparisons are used on purpose: a `(-a) >> 31` trick depends on a 32-bit
+// int with arithmetic right shift and overflows when negating INT_MIN.
+static uint8_t clipUint8(int a)
+{
+    if (a < 0)
+        return 0;
+    if (a > 0xFF)
+        return 0xFF;
+    return (uint8_t)a;
+}
+
+// C reference for the vertical pass with 8-bit output. Each output pixel is the
+// weighted sum of the same column across filterSize intermediate lines, seeded with
+// 64 << 12 before the 19-bit down-shift, then passed through clipUint8().
+static void yuv2PlaneX_c(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = 64 << 12;
+        int line = 0;
+        while (line < filterSize)
+        {
+            acc += src[line][x] * filter[line];
+            line++;
+        }
+        dest[x] = clipUint8(acc >> 19);
+    }
+}
+#else
+// C reference for the vertical pass with 16-bit output samples. `dest` is addressed
+// as an array of uint16_t; each value is the filtered column sum (seeded with 1 << 16,
+// shifted down by 17), clamped to 0..SHORT_MAX_10 and stored low byte first.
+static void yuv2PlaneX_c_h(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+    uint16_t *out16 = (uint16_t *)dest;
+    for (int x = 0; x < dstW; x++)
+    {
+        int acc = 1 << 16;
+        for (int line = 0; line < filterSize; line++)
+            acc += src[line][x] * filter[line];
+
+        const uint16_t sample = x265_clip3(0, SHORT_MAX_10, acc >> 17);
+        uint8_t *bytes = (uint8_t *)(&out16[x]);
+        bytes[0] = sample;
+        bytes[1] = sample >> 8;
+    }
+}
+// C reference for the horizontal pass on 16-bit input. `src` is re-read as uint16_t
+// samples; for every output sample x the filterSize samples starting at filterPos[x]
+// are weighted by row x of `filter`, the sum is shifted down by 9 bits and clamped to
+// the signed 16-bit range.
+static void doScaling_c_h(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+    const uint16_t *wideSrc = (const uint16_t *)src;
+    const int16_t *taps = filter; // coefficient row of the current output sample
+    for (int x = 0; x < dstW; x++, taps += filterSize)
+    {
+        const int start = filterPos[x];
+        int acc = 0;
+        for (int t = 0; t < filterSize; t++)
+            acc += ((int)wideSrc[start + t]) * taps[t];
+        // the cubic equation does overflow
+        dst[x] = x265_clip3(SHORT_MIN, SHORT_MAX, acc >> 9);
+    }
+}
+#endif
+
+// Start with a zero filter length, no position/coefficient arrays and no slices attached.
+ScalerFilter::ScalerFilter()
+    : m_filtLen(0), m_filtPos(NULL), m_filt(NULL), m_sourceSlice(NULL), m_destSlice(NULL)
+{
+}
+
+// Release the position and coefficient arrays that initCoeff() allocates with new[].
+ScalerFilter::~ScalerFilter()
+{
+    // delete[] on a null pointer does nothing, so no explicit checks are needed.
+    delete[] m_filtPos;
+    m_filtPos = NULL;
+
+    delete[] m_filt;
+    m_filt = NULL;
+}
+
+// Horizontally scale `sliceHor` consecutive luma lines, the first one being line
+// `sliceVer`. Line numbers are converted to buffer indices by subtracting each
+// slice's own sliceVer; the destination plane's sliceHor grows by one per line.
+void ScalerHLumFilter::process(int sliceVer, int sliceHor)
+{
+    uint8_t **inLines = m_sourceSlice->m_plane[0].lineBuf;
+    uint8_t **outLines = m_destSlice->m_plane[0].lineBuf;
+    const int inBase = sliceVer - m_sourceSlice->m_plane[0].sliceVer;
+    const int outBase = sliceVer - m_destSlice->m_plane[0].sliceVer;
+    const int outWidth = m_destSlice->m_width;
+
+    int line = 0;
+    while (line < sliceHor)
+    {
+        int16_t *outLine = (int16_t*)outLines[outBase + line];
+        const uint8_t *inLine = (const uint8_t *)inLines[inBase + line];
+        m_hFilterScaler->doScaling(outLine, outWidth, inLine, m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[0].sliceHor += 1;
+        ++line;
+    }
+}
+
+// Horizontally scale `sliceHor` consecutive lines of both chroma planes (plane 1 and
+// plane 2), the first one being line `sliceVer`. The output width is the slice width
+// shifted right by the destination's horizontal chroma sub-sampling. Each processed
+// line bumps the sliceHor counter of both destination planes.
+void ScalerHCrFilter::process(int sliceVer, int sliceHor)
+{
+    const int outWidth = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+
+    // First chroma plane.
+    uint8_t ** inU = m_sourceSlice->m_plane[1].lineBuf;
+    uint8_t ** outU = m_destSlice->m_plane[1].lineBuf;
+    const int inBaseU = sliceVer - m_sourceSlice->m_plane[1].sliceVer;
+    const int outBaseU = sliceVer - m_destSlice->m_plane[1].sliceVer;
+
+    // Second chroma plane.
+    uint8_t ** inV = m_sourceSlice->m_plane[2].lineBuf;
+    uint8_t ** outV = m_destSlice->m_plane[2].lineBuf;
+    const int inBaseV = sliceVer - m_sourceSlice->m_plane[2].sliceVer;
+    const int outBaseV = sliceVer - m_destSlice->m_plane[2].sliceVer;
+
+    int line = 0;
+    while (line < sliceHor)
+    {
+        m_hFilterScaler->doScaling((int16_t*)outU[outBaseU + line], outWidth, inU[inBaseU + line], m_filt, m_filtPos, m_filtLen);
+        m_hFilterScaler->doScaling((int16_t*)outV[outBaseV + line], outWidth, inV[inBaseV + line], m_filt, m_filtPos, m_filtLen);
+        m_destSlice->m_plane[1].sliceHor += 1;
+        m_destSlice->m_plane[2].sliceHor += 1;
+        ++line;
+    }
+}
+
+// Vertical-pass entry point of VFilterScaler8Bit. It forwards to the C reference
+// kernel selected at compile time by X265_DEPTH; no width- or filter-size-specific
+// kernel is chosen here, so no dispatch indices are computed.
+void VFilterScaler8Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+// Vertical-pass entry point of VFilterScaler10Bit. It forwards to the C reference
+// kernel selected at compile time by X265_DEPTH; no width- or filter-size-specific
+// kernel is chosen here, so no dispatch indices are computed.
+void VFilterScaler10Bit::yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW)
+{
+#if X265_DEPTH == 8
+    yuv2PlaneX_c(filter, filterSize, src, dest, dstW);
+#else
+    yuv2PlaneX_c_h(filter, filterSize, src, dest, dstW);
+#endif
+}
+
+// Vertically filter one luma output line, `sliceVer`. The first contributing source
+// line is m_filtPos[sliceVer], but never less than 1 - m_filtLen; the coefficients
+// are row `sliceVer` of m_filt. `sliceHor` is not used.
+void ScalerVLumFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+
+    int firstSrcLine = m_filtPos[sliceVer];
+    if (firstSrcLine < 1 - m_filtLen)
+        firstSrcLine = 1 - m_filtLen;
+
+    // Translate absolute line numbers into indices of each slice's line table.
+    uint8_t **inLines = m_sourceSlice->m_plane[0].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[0].sliceVer);
+    uint8_t **outLines = m_destSlice->m_plane[0].lineBuf + (sliceVer - m_destSlice->m_plane[0].sliceVer);
+    int16_t *taps = m_filt + (sliceVer * m_filtLen);
+
+    m_vFilterScaler->yuv2PlaneX(taps, m_filtLen, (const int16_t**)inLines, outLines[0], m_destSlice->m_width);
+}
+
+// Vertically filter one output line of both chroma planes for luma line `sliceVer`.
+// Luma lines that are not a multiple of the vertical chroma sub-sampling step are
+// skipped; otherwise the chroma line index is sliceVer >> m_vCrSubSample and the
+// same coefficient row is applied to plane 1 and plane 2. `sliceHor` is not used.
+void ScalerVCrFilter::process(int sliceVer, int sliceHor)
+{
+    (void)sliceHor;
+
+    const int skipMask = (1 << m_destSlice->m_vCrSubSample) - 1;
+    if (sliceVer & skipMask)
+        return;
+
+    const int outWidth = m_destSlice->m_width >> m_destSlice->m_hCrSubSample;
+    const int crLine = sliceVer >> m_destSlice->m_vCrSubSample;
+
+    int firstSrcLine = m_filtPos[crLine];
+    if (firstSrcLine < 1 - m_filtLen)
+        firstSrcLine = 1 - m_filtLen;
+
+    int16_t *taps = m_filt + (crLine * m_filtLen);
+
+    // Plane 1.
+    uint8_t **inU = m_sourceSlice->m_plane[1].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[1].sliceVer);
+    uint8_t **outU = m_destSlice->m_plane[1].lineBuf + (crLine - m_destSlice->m_plane[1].sliceVer);
+    // Plane 2.
+    uint8_t **inV = m_sourceSlice->m_plane[2].lineBuf + (firstSrcLine - m_sourceSlice->m_plane[2].sliceVer);
+    uint8_t **outV = m_destSlice->m_plane[2].lineBuf + (crLine - m_destSlice->m_plane[2].sliceVer);
+
+    m_vFilterScaler->yuv2PlaneX((int16_t*)taps, m_filtLen, (const int16_t**)inU, outU[0], outWidth);
+    m_vFilterScaler->yuv2PlaneX((int16_t*)taps, m_filtLen, (const int16_t**)inV, outV[0], outWidth);
+}
+
+/* Build the filter tables (m_filtPos, m_filt, m_filtLen) for one scaling direction.
+ *
+ * flag       filter selector: 4 builds bicubic taps, 1 builds bilinear taps; any other
+ *            value hits assert(0). The same value is also used as the size factor
+ *            that determines the initial tap count.
+ * inc        source step per destination sample in 16.16 fixed point
+ *            (values above 1 << 16 mean down-scaling).
+ * srcW/dstW  number of source / destination samples along this direction.
+ * filtAlign  the final filter length is rounded up to a multiple of this value
+ *            (the mask arithmetic assumes a power of two).
+ * one        fixed-point value that each normalised coefficient row sums to.
+ * sourcePos  sample position offsets of source and destination, as produced by
+ * destPos    ScalerFilterManager::getLocalPos().
+ *
+ * m_filtPos receives dstW + 3 entries and m_filt (dstW + 3) * m_filtLen entries; the
+ * three extra rows repeat the last real row. Always returns 0.
+ */
+int ScalerFilter::initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos)
+{
+    int filterSize;
+    int filter2Size;
+    int minFilterSize;
+    int64_t *filter = NULL;
+    int64_t *filter2 = NULL;
+    const int64_t fone = 1LL << (54 - x265_min((int)X265_LOG2(srcW / dstW), 8));
+    int *outFilterSize = &m_filtLen;
+    int64_t xDstInSrc;
+    int sizeFactor = flag;
+
+    // Drop tables left over from an earlier call so they are not leaked.
+    delete[] m_filtPos;
+    m_filtPos = NULL;
+    delete[] m_filt;
+    m_filt = NULL;
+
+    // Init filter pos, the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
+    m_filtPos = new int32_t[dstW + 3];
+    int32_t **filterPos = &m_filtPos;
+
+    if (inc <= 1 << 16)
+        filterSize = 1 + sizeFactor; // upscale
+    else
+        filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
+
+    filterSize = x265_min(filterSize, srcW - 2);
+    filterSize = x265_max(filterSize, 1);
+    // new[] counts elements, not bytes: one int64_t per tap of every output sample.
+    filter = new int64_t[(size_t)dstW * filterSize];
+
+    xDstInSrc = ((destPos*(int64_t)inc) >> 7) - ((sourcePos * 0x10000LL) >> 7);
+    for (int i = 0; i < dstW; i++)
+    {
+        int xx = (xDstInSrc - (filterSize - 2) * (1LL << 16)) / (1 << 17);
+        (*filterPos)[i] = xx;
+        for (int j = 0; j < filterSize; j++)
+        {
+            // distance between tap position xx and the ideal source position
+            int64_t d = (X265_ABS(((int64_t)xx * (1 << 17)) - xDstInSrc)) << 13;
+            int64_t coeff = 0;
+
+            if (inc > 1 << 16)
+                d = d * dstW / srcW;
+
+            if (flag == 4) // BiCUBIC
+            {
+                int64_t B = (0) * (1 << 24);
+                int64_t C = (0.6) * (1 << 24);
+
+                if (d >= 1LL << 31)
+                    coeff = 0.0;
+                else
+                {
+                    int64_t dd = (d  * d) >> 30;
+                    int64_t ddd = (dd * d) >> 30;
+
+                    if (d < 1LL << 30)
+                        coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd + (-18 * (1 << 24) + 12 * B + 6 * C) * dd + (6 * (1 << 24) - 2 * B) * (1 << 30);
+                    else
+                        coeff = (-B - 6 * C) * ddd + (6 * B + 30 * C) * dd + (-12 * B - 48 * C) * d + (8 * B + 24 * C) * (1 << 30);
+                }
+                coeff /= (1LL << 54) / fone;
+            }
+            else if (flag == 1) // BILINEAR
+            {
+                coeff = (1 << 30) - d;
+                if (coeff < 0)
+                    coeff = 0;
+                coeff *= fone >> 30;
+            }
+            else
+                assert(0);
+
+            filter[i * filterSize + j] = coeff;
+            xx++;
+        }
+        xDstInSrc += 2 * inc;
+    }
+
+    //apply src & dst Filter to filter -> filter2
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter2Size = filterSize;
+    filter2 = new int64_t[(size_t)dstW * filter2Size];
+
+    // No extra source/destination filter is applied here: filter2 is a plain copy.
+    int size = dstW * filterSize;
+    filter_copy_c(filter, filter2, size);
+
+    delete[](filter);
+
+    // try to reduce the filter-size (step1 find size and shift left)
+    // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
+    minFilterSize = 0;
+    for (int i = dstW - 1; i >= 0; i--)
+    {
+        int min = filter2Size;
+        int64_t cutOff = 0.0;
+
+        // get rid of near zero elements on the left by shifting left
+        for (int j = 0; j < filter2Size; j++)
+        {
+            int k;
+            // always the first element: the row is shifted left on every pass
+            cutOff += X265_ABS(filter2[i * filter2Size]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            // preserve monotonicity because the core can't handle the filter otherwise
+            if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
+                break;
+
+            // move filter coefficients left
+            for (k = 1; k < filter2Size; k++)
+                filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
+            filter2[i * filter2Size + k - 1] = 0;
+            (*filterPos)[i]++;
+        }
+
+        cutOff = 0;
+        // count near zeros on the right
+        for (int j = filter2Size - 1; j > 0; j--)
+        {
+            cutOff += X265_ABS(filter2[i * filter2Size + j]);
+
+            if (cutOff > SCALER_MAX_REDUCE_CUTOFF * fone)
+                break;
+            min--;
+        }
+
+        if (min > minFilterSize)
+            minFilterSize = min;
+    }
+
+    X265_CHECK(minFilterSize > 0, "invalid minFilterSize value.\n");
+    filterSize = (minFilterSize + (filtAlign - 1)) & (~(filtAlign - 1));
+    X265_CHECK(filterSize > 0, "invalid filterSize value.\n");
+    filter = new int64_t[(size_t)dstW * filterSize];
+
+    *outFilterSize = filterSize;
+
+    // try to reduce the filter-size (step2 reduce it)
+    for (int i = 0; i < dstW; i++)
+    {
+        for (int j = 0; j < filterSize; j++)
+        {
+            if (j >= filter2Size)
+                filter[i * filterSize + j] = 0;
+            else
+                filter[i * filterSize + j] = filter2[i * filter2Size + j];
+            if ((flag & SCALER_BITEXACT) && j >= minFilterSize)
+                filter[i * filterSize + j] = 0;
+        }
+    }
+
+    // fix borders
+    for (int i = 0; i < dstW; i++)
+    {
+        int j;
+        if ((*filterPos)[i] < 0)
+        {
+            // move filter coefficients left to compensate for filterPos
+            for (j = 1; j < filterSize; j++)
+            {
+                int left = x265_max(j + (*filterPos)[i], 0);
+                filter[i * filterSize + left] += filter[i * filterSize + j];
+                filter[i * filterSize + j] = 0;
+            }
+            (*filterPos)[i] = 0;
+        }
+
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            int shift = (*filterPos)[i] + x265_min(filterSize - srcW, 0);
+            int64_t acc = 0;
+
+            // collect the weight of every tap that would read past the last source sample
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if ((*filterPos)[i] + j >= srcW)
+                {
+                    acc += filter[i * filterSize + j];
+                    filter[i * filterSize + j] = 0;
+                }
+            }
+            // slide the remaining taps right by `shift` so the window ends inside the source
+            for (j = filterSize - 1; j >= 0; j--)
+            {
+                if (j < shift)
+                    filter[i * filterSize + j] = 0;
+                else
+                    filter[i * filterSize + j] = filter[i * filterSize + j - shift];
+            }
+
+            (*filterPos)[i] -= shift;
+            // the collected weight goes to the tap that lands on the last source sample
+            filter[i * filterSize + srcW - 1 - (*filterPos)[i]] += acc;
+        }
+
+        X265_CHECK((*filterPos)[i] >= 0, "invalid: Value of (*filterPos)[%d] < 0.\n", i);
+        X265_CHECK((*filterPos)[i] < srcW, "invalid: Value of (*filterPos)[%d] > %d .\n", i, srcW);
+        if ((*filterPos)[i] + filterSize > srcW)
+        {
+            for (j = 0; j < filterSize; j++)
+            {
+                X265_CHECK(!filter[i * filterSize + j], "invalid: Value of filter[%d * filterSize + %d] != 0.\n", i, j);
+                X265_CHECK((*filterPos)[i] + j < srcW, "invalid: (*filterPos)[%d] + %d > %d .\n", i, j, srcW);
+            }
+        }
+    }
+
+    // init filter
+    m_filt = new int16_t[(dstW + 3)*(*outFilterSize)];
+    int16_t **outFilter = &m_filt;
+
+    // normalize & store in outFilter
+    for (int i = 0; i < dstW; i++)
+    {
+        int64_t error = 0;
+        int64_t sum = 0;
+
+        for (int j = 0; j < filterSize; j++)
+            sum += filter[i * filterSize + j];
+        sum = (sum + one / 2) / one;
+        if (!sum)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "Scaler: zero vector in scaling\n");
+            sum = 1;
+        }
+        for (int j = 0; j < *outFilterSize; j++)
+        {
+            // carry the rounding error of each tap into the next one
+            int64_t v = filter[i * filterSize + j] + error;
+            int intV = ROUNDED_DIVISION(v, sum);
+            (*outFilter)[i * (*outFilterSize) + j] = intV;
+            error = v - intV * sum;
+        }
+    }
+
+    // replicate the last position and coefficient row into the three padding rows
+    (*filterPos)[dstW + 0] =
+        (*filterPos)[dstW + 1] =
+        (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1];
+    for (int i = 0; i < *outFilterSize; i++)
+    {
+        int k = (dstW - 1) * (*outFilterSize) + i;
+        (*outFilter)[k + 1 * (*outFilterSize)] =
+            (*outFilter)[k + 2 * (*outFilterSize)] =
+            (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
+    }
+
+    delete[](filter);
+    delete[](filter2);
+    return 0;
+}
+
+/* Prepare the manager for scaling between the two described video formats.
+ *
+ * algorithmFlags  forwarded to every ScalerFilter::initCoeff() call as the filter selector.
+ * srcVideoDesc    width, height and colour space of the input pictures.
+ * dstVideoDesc    width, height, colour space and bit depth of the output pictures.
+ *
+ * Creates the four filters (horizontal luma, horizontal chroma, vertical luma,
+ * vertical chroma), the three slices, and wires them together.
+ *
+ * Returns 0 on success and -1 on failure: the colour spaces differ, a luma or chroma
+ * dimension is not positive, or the slice buffers could not be created.
+ */
+int ScalerFilterManager::init(int algorithmFlags, VideoDesc *srcVideoDesc, VideoDesc *dstVideoDesc)
+{
+    int srcW = m_srcW = srcVideoDesc->m_width;
+    int srcH = m_srcH = srcVideoDesc->m_height;
+    int dstW = m_dstW = dstVideoDesc->m_width;
+    int dstH = m_dstH = dstVideoDesc->m_height;
+    int lumXInc, crXInc;
+    int lumYInc, crYInc;
+    // -513 is one of the values getLocalPos() replaces with its default position.
+    int srcHCrPos = -513;
+    int dstHCrPos = -513;
+    int srcVCrPos = -513;
+    int dstVCrPos = -513;
+
+    m_bitDepth = dstVideoDesc->m_inputDepth;
+    m_algorithmFlags = algorithmFlags;
+
+    int srcCsp = srcVideoDesc->m_csp;
+    if (x265_cli_csps[srcCsp].planes > 1)
+    {
+        m_crSrcHSubSample = x265_cli_csps[srcCsp].width[1];
+        m_crSrcVSubSample = x265_cli_csps[srcCsp].height[1];
+        m_crSrcW = srcVideoDesc->m_width >> m_crSrcHSubSample;
+        m_crSrcH = srcVideoDesc->m_height >> m_crSrcVSubSample;
+        if (srcCsp == 1)// i420
+            srcVCrPos = 128;
+    }
+    else
+    {
+        m_crSrcW = 0;
+        m_crSrcH = 0;
+        m_crSrcHSubSample = 0;
+        m_crSrcVSubSample = 0;
+    }
+    int dstCsp = dstVideoDesc->m_csp;
+    if (x265_cli_csps[dstCsp].planes > 1)
+    {
+        m_crDstHSubSample = x265_cli_csps[dstCsp].width[1];
+        m_crDstVSubSample = x265_cli_csps[dstCsp].height[1];
+        m_crDstW = dstVideoDesc->m_width >> m_crDstHSubSample;
+        m_crDstH = dstVideoDesc->m_height >> m_crDstVSubSample;
+        if (dstCsp == 1)// i420
+            dstVCrPos = 128;
+    }
+    else
+    {
+        m_crDstW = 0;
+        m_crDstH = 0;
+        m_crDstHSubSample = 0;
+        m_crDstVSubSample = 0;
+    }
+    // Only srcCsp == dstCsp is supported at present
+    if (srcCsp != dstCsp)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "wrong, source csp != destination csp \n");
+        return -1;
+    }
+
+    // Every dimension below is used as a divisor or as a filter length, so a zero
+    // (for example the chroma size of a single-plane colour space) must be rejected.
+    if (srcW <= 0 || srcH <= 0 || dstW <= 0 || dstH <= 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "scaler: invalid picture dimensions\n");
+        return -1;
+    }
+    if (m_crSrcW <= 0 || m_crSrcH <= 0 || m_crDstW <= 0 || m_crDstH <= 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "scaler: invalid or missing chroma planes\n");
+        return -1;
+    }
+
+    // source step per destination sample, 16.16 fixed point, rounded to nearest
+    lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
+    lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
+    crXInc = (((int64_t)m_crSrcW << 16) + (m_crDstW >> 1)) / m_crDstW;
+    crYInc = (((int64_t)m_crSrcH << 16) + (m_crDstH >> 1)) / m_crDstH;
+
+    const int filterAlign = 1;
+
+    // init horizontal Luma Scaler filter
+    m_ScalerFilters[0] = new ScalerHLumFilter(m_bitDepth);
+    m_ScalerFilters[0]->initCoeff(m_algorithmFlags, lumXInc, srcW, dstW, filterAlign, 1 << 14, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init horizontal cr Scaler filter
+    m_ScalerFilters[1] = new ScalerHCrFilter(m_bitDepth);
+    m_ScalerFilters[1]->initCoeff(m_algorithmFlags, crXInc, m_crSrcW, m_crDstW, filterAlign, 1 << 14,
+        getLocalPos(m_crSrcHSubSample, srcHCrPos), getLocalPos(m_crDstHSubSample, dstHCrPos));
+
+    // init vertical Luma scaler filter
+    m_ScalerFilters[2] = new ScalerVLumFilter(m_bitDepth);
+    m_ScalerFilters[2]->initCoeff(m_algorithmFlags, lumYInc, srcH, dstH, filterAlign, 1 << 12, getLocalPos(0, 0), getLocalPos(0, 0));
+
+    // init vertical cr scaler filter
+    m_ScalerFilters[3] = new ScalerVCrFilter(m_bitDepth);
+    m_ScalerFilters[3]->initCoeff(m_algorithmFlags, crYInc, m_crSrcH, m_crDstH, filterAlign, 1 << 12,
+        getLocalPos(m_crSrcVSubSample, srcVCrPos), getLocalPos(m_crDstVSubSample, dstVCrPos));
+
+    // init slice, must after filter initialization
+    if (initScalerSlice() < 0)
+        return -1;
+
+    // set slice: horizontal filters read slice 0 and write slice 1,
+    // vertical filters read slice 1 and write slice 2
+    m_ScalerFilters[0]->setSlice(m_slices[0], m_slices[1]);
+    m_ScalerFilters[1]->setSlice(m_slices[0], m_slices[1]);
+
+    m_ScalerFilters[2]->setSlice(m_slices[1], m_slices[2]);
+    m_ScalerFilters[3]->setSlice(m_slices[1], m_slices[2]);
+
+    return 0;
+}
+
+// Horizontal-pass entry point of HFilterScaler8Bit. It forwards to the C reference
+// kernel selected at compile time by X265_DEPTH; no width- or filter-size-specific
+// kernel is chosen here, so no dispatch indices are computed.
+void HFilterScaler8Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+// Horizontal-pass entry point of HFilterScaler10Bit. It forwards to the C reference
+// kernel selected at compile time by X265_DEPTH; no width- or filter-size-specific
+// kernel is chosen here, so no dispatch indices are computed.
+void HFilterScaler10Bit::doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize)
+{
+#if X265_DEPTH == 8
+    doScaling_c(dst, dstW, src, filter, filterPos, filterSize);
+#else
+    doScaling_c_h(dst, dstW, src, filter, filterPos, filterSize);
+#endif
+}
+
+/* Scale one picture from `src` into `dst`.
+ *
+ * src / dst              pointer arrays handed to ScalerSlice::initFromSrc()
+ *                        (presumably one pointer per plane - confirm against callers).
+ * srcStride / dstStride  stride arrays handed to the same calls.
+ *
+ * For every destination line the loop works out which source lines the vertical
+ * filters need, runs the horizontal filters (m_ScalerFilters[0] and [1]) on the source
+ * lines not yet present in the intermediate slice, then runs the remaining filters
+ * (index 2 and up) to produce that destination line.
+ *
+ * Returns -1 when `src` or `dst` is null, otherwise 0.
+ */
+int ScalerFilterManager::scale_pic(void ** src, void ** dst, int * srcStride, int * dstStride)
+{
+    // The typed copies are only used for the null check below.
+    uint8_t** src_8bit, **dst_8bit;
+    src_8bit = (uint8_t**)src;
+    dst_8bit = (uint8_t**)dst;
+    if (!src_8bit || !dst_8bit)
+        return -1;
+
+    // The whole source picture is treated as a single slice of m_srcH lines.
+    const int srcsliceHor = m_srcH;
+    const int dstW = m_dstW;
+    const int dstH = m_dstH;
+    int32_t *vLumFilterPos = m_ScalerFilters[2]->m_filtPos;
+    int32_t *vCrFilterPos = m_ScalerFilters[3]->m_filtPos;
+    const int vLumFilterSize = m_ScalerFilters[2]->m_filtLen;
+    const int vCrFilterSize = m_ScalerFilters[3]->m_filtLen;
+    const int crSrcsliceHor = UH_CEIL_SHIFTR(srcsliceHor, m_crSrcVSubSample);
+
+    // vars which will change and which we need to store back in the context
+    // NOTE(review): lumBufIndex / crBufIndex are updated below but never read for
+    // anything except their own wrap-around.
+    int lumBufIndex = -1;
+    int crBufIndex = -1;
+    int lastInLumBuf = -1;
+    int lastInCrBuf = -1;
+
+    int hasLumHoles = 1;
+    int hasCrHoles = 1;
+
+    ScalerSlice *src_slice = m_slices[0];
+    ScalerSlice *hout_slice = m_slices[1];
+    ScalerSlice *vout_slice = m_slices[2];
+    src_slice->initFromSrc((uint8_t**)src, srcStride, m_srcW, 0, srcsliceHor, 0, crSrcsliceHor, 1);
+    vout_slice->initFromSrc((uint8_t**)dst, dstStride, m_dstW, 0, dstH, 0, UH_CEIL_SHIFTR(dstH, m_crDstVSubSample), 0);
+
+    // Reset the intermediate (horizontal output) slice: first line and line count of all four planes.
+    hout_slice->m_plane[0].sliceVer = 0;
+    hout_slice->m_plane[1].sliceVer = 0;
+    hout_slice->m_plane[2].sliceVer = 0;
+    hout_slice->m_plane[3].sliceVer = 0;
+    hout_slice->m_plane[0].sliceHor = 0;
+    hout_slice->m_plane[1].sliceHor = 0;
+    hout_slice->m_plane[2].sliceHor = 0;
+    hout_slice->m_plane[3].sliceHor = 0;
+    hout_slice->m_width = dstW;
+
+    for (int dstY = 0; dstY < dstH; dstY++)
+    {
+        const int crDstY = dstY >> m_crDstVSubSample;
+        // First source line each vertical filter needs for this output line.
+        const int firstLumSrcY = x265_max(1 - vLumFilterSize, vLumFilterPos[dstY]);
+        // Same, for the last luma line that shares this chroma line (capped at dstH - 1).
+        const int firstLumSrcY2 = x265_max(1 - vLumFilterSize, vLumFilterPos[x265_min(dstY | ((1 << m_crDstVSubSample) - 1), dstH - 1)]);
+        const int firstCrSrcY = x265_max(1 - vCrFilterSize, vCrFilterPos[crDstY]);
+
+        // Last source line needed, capped at the last line of the picture.
+        int lastLumSrcY = x265_min(m_srcH, firstLumSrcY + vLumFilterSize) - 1;
+        int lastLumSrcY2 = x265_min(m_srcH, firstLumSrcY2 + vLumFilterSize) - 1;
+        int lastCrSrcY = x265_min(m_crSrcH, firstCrSrcY + vCrFilterSize) - 1;
+
+        // handle holes
+        // A "hole" is a gap between the last buffered line and the first line now needed;
+        // when there is one, the intermediate planes restart at the new first line.
+        if (firstLumSrcY > lastInLumBuf)
+        {
+            hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
+            if (hasLumHoles)
+            {
+                hout_slice->m_plane[0].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[3].sliceVer = firstLumSrcY;
+                hout_slice->m_plane[0].sliceHor =
+                    hout_slice->m_plane[3].sliceHor = 0;
+            }
+
+            lastInLumBuf = firstLumSrcY - 1;
+        }
+        if (firstCrSrcY > lastInCrBuf)
+        {
+            hasCrHoles = lastInCrBuf != firstCrSrcY - 1;
+            if (hasCrHoles)
+            {
+                hout_slice->m_plane[1].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[2].sliceVer = firstCrSrcY;
+                hout_slice->m_plane[1].sliceHor =
+                    hout_slice->m_plane[2].sliceHor = 0;
+            }
+
+            lastInCrBuf = firstCrSrcY - 1;
+        }
+
+        // Do we have enough lines in this slice to output the dstY line
+        int enoughLines = lastLumSrcY2 < 0 + srcsliceHor && lastCrSrcY < UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample);
+        if (!enoughLines)
+        {
+            // Buffer whatever is left of the source; the loop is left further down.
+            lastLumSrcY = 0 + srcsliceHor - 1;
+            lastCrSrcY = 0 + crSrcsliceHor - 1;
+            x265_log(NULL, X265_LOG_INFO, "buffering slice: lastLumSrcY %d lastCrSrcY %d\n", lastLumSrcY, lastCrSrcY);
+        }
+
+        X265_CHECK(((lastLumSrcY - firstLumSrcY + 1) <= hout_slice->m_plane[0].availLines), "invalid value %d", lastLumSrcY - firstLumSrcY + 1);
+        X265_CHECK((lastCrSrcY - firstCrSrcY + 1) <= hout_slice->m_plane[1].availLines, "invalid value %d", lastCrSrcY - firstCrSrcY + 1);
+
+        // Range of source lines to run through the horizontal filters in this iteration.
+        int firstPosY, lastPosY, firstCPosY, lastCPosY;
+        // posY: first luma line not yet present in the intermediate slice.
+        int posY = hout_slice->m_plane[0].sliceVer + hout_slice->m_plane[0].sliceHor;
+        if (posY <= lastLumSrcY && !hasLumHoles)
+        {
+            firstPosY = x265_max(firstLumSrcY, posY);
+            lastPosY = x265_min(firstLumSrcY + hout_slice->m_plane[0].availLines - 1, 0 + srcsliceHor - 1);
+        }
+        else
+        {
+            firstPosY = posY;
+            lastPosY = lastLumSrcY;
+        }
+
+        // cPosY: first chroma line not yet present in the intermediate slice.
+        int cPosY = hout_slice->m_plane[1].sliceVer + hout_slice->m_plane[1].sliceHor;
+        if (cPosY <= lastCrSrcY && !hasCrHoles)
+        {
+            firstCPosY = x265_max(firstCrSrcY, cPosY);
+            lastCPosY = x265_min(firstCrSrcY + hout_slice->m_plane[1].availLines - 1, UH_CEIL_SHIFTR(0 + srcsliceHor, m_crSrcVSubSample) - 1);
+        }
+        else
+        {
+            firstCPosY = cPosY;
+            lastCPosY = lastCrSrcY;
+        }
+
+        // NOTE(review): rotate() is defined outside this view; presumably it advances the
+        // intermediate line buffers so they can hold lines up to lastPosY / lastCPosY - confirm.
+        hout_slice->rotate(lastPosY, lastCPosY);
+        // horizontal luma scale
+        if (posY < lastLumSrcY + 1)
+            m_ScalerFilters[0]->process(firstPosY, lastPosY - firstPosY + 1);
+
+        lumBufIndex += lastLumSrcY - lastInLumBuf;
+        lastInLumBuf = lastLumSrcY;
+        // horizontal chroma Scale
+        if (cPosY < lastCrSrcY + 1)
+            m_ScalerFilters[1]->process(firstCPosY, lastCPosY - firstCPosY + 1);
+
+        crBufIndex += lastCrSrcY - lastInCrBuf;
+        lastInCrBuf = lastCrSrcY;
+
+        // wrap buf index around to stay inside the ring buffer
+        if (lumBufIndex >= vLumFilterSize)
+            lumBufIndex -= vLumFilterSize;
+        if (crBufIndex >= vCrFilterSize)
+            crBufIndex -= vCrFilterSize;
+        if (!enoughLines)
+            break;  // we can't output a dstY line so let's try with the next slice
+
+        // vertical scale(output converter)
+        // Filters from index 2 upwards each produce exactly one line, dstY.
+        for (int i = 2; i < m_numFilter; ++i)
+            m_ScalerFilters[i]->process(dstY, 1);
+    }
+    return 0;
+}
+
+/* Compute how many intermediate lines the vertical filters need buffered.
+ *
+ * out_lum_size  receives the luma line count; never less than the vertical luma filter length.
+ * out_cr_size   receives the chroma line count; never less than the vertical chroma filter length.
+ *
+ * For each destination line the last source line required by either vertical filter
+ * is rounded down to a multiple of the chroma sub-sampling step, and the distance
+ * from each filter's first line to that point is folded into the running maxima.
+ */
+void ScalerFilterManager::getMinBufferSize(int *out_lum_size, int *out_cr_size)
+{
+    int *lumPos = m_ScalerFilters[2]->m_filtPos;
+    int *crPos = m_ScalerFilters[3]->m_filtPos;
+    const int lumTaps = m_ScalerFilters[2]->m_filtLen;
+    const int crTaps = m_ScalerFilters[3]->m_filtLen;
+    const int outLines = m_dstH;
+    const int crOutLines = m_crDstH;
+    const int crShift = m_crSrcVSubSample;
+
+    int &lumNeeded = *out_lum_size;
+    int &crNeeded = *out_cr_size;
+    lumNeeded = lumTaps;
+    crNeeded = crTaps;
+
+    for (int y = 0; y < outLines; y++)
+    {
+        // chroma line that corresponds to luma output line y
+        const int crY = (int64_t)y * crOutLines / outLines;
+        const int lumLast = lumPos[y] + lumTaps - 1;
+        const int crLast = (crPos[crY] + crTaps - 1) << crShift;
+
+        int nextSlice = x265_max(lumLast, crLast);
+        // round down to a multiple of the chroma sub-sampling step
+        nextSlice = (nextSlice >> crShift) << crShift;
+
+        lumNeeded = x265_max(lumNeeded, nextSlice - lumPos[y]);
+        crNeeded = x265_max(crNeeded, (nextSlice >> crShift) - crPos[crY]);
+    }
+}
+
+/* Allocate the three slices used by scale_pic(): slice 0 for the source picture,
+ * slice 1 for the horizontal scaler output (the only one given line storage here)
+ * and slice 2 for the vertical scaler output.
+ *
+ * Reads the vertical filter lengths, so the filters must already be initialised.
+ * Returns 0 on success, -1 as soon as one create()/createLines() call reports failure.
+ */
+int ScalerFilterManager::initScalerSlice()
+{
+    // Bytes per intermediate line; doubled when the bit depth is 16.
+    int lineBytes = SCALER_ALIGN(m_dstW * sizeof(int16_t) + 66, 16);
+    if (m_bitDepth == 16)
+        lineBytes <<= 1;
+
+    const int vLumTaps = m_ScalerFilters[2]->m_filtLen; // Vertical filter size for luma pixels.
+    const int vCrTaps = m_ScalerFilters[3]->m_filtLen;  // Vertical filter size for chroma pixels.
+
+    // Intermediate line counts: at least the filter length plus MAX_NUM_LINES_AHEAD.
+    int lumLines;
+    int crLines;
+    getMinBufferSize(&lumLines, &crLines);
+    if (lumLines < vLumTaps + MAX_NUM_LINES_AHEAD)
+        lumLines = vLumTaps + MAX_NUM_LINES_AHEAD;
+    if (crLines < vCrTaps + MAX_NUM_LINES_AHEAD)
+        crLines = vCrTaps + MAX_NUM_LINES_AHEAD;
+
+    for (int s = 0; s < m_numSlice; s++)
+        m_slices[s] = new ScalerSlice;
+
+    // source picture
+    if (m_slices[0]->create(m_srcH, m_crSrcH, m_crSrcHSubSample, m_crSrcVSubSample, 0) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "alloc_slice m_slice[0] failed\n");
+        return -1;
+    }
+
+    // horizontal scaler output
+    if (m_slices[1]->create(lumLines, crLines, m_crDstHSubSample, m_crDstVSubSample, 1) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].create failed\n");
+        return -1;
+    }
+    if (m_slices[1]->createLines(lineBytes, m_dstW) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[1].createLines failed\n");
+        return -1;
+    }
+
+    m_slices[1]->fillOnes(lineBytes >> 1, m_bitDepth == 16);
+
+    // vertical scaler output
+    if (m_slices[2]->create(m_dstH, m_crDstH, m_crDstHSubSample, m_crDstVSubSample, 0) < 0)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "m_slice[2].create failed\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+// Converts a chroma position into a value relative to the ideal left edge.
+//   crSubSample: binary log of the chroma subsampling factor.
+//   pos:         requested position; -1 or any value <= -513 is replaced by
+//                (128 << crSubSample) - 128 before the conversion.
+// Returns (pos + 128) >> crSubSample.
+int ScalerFilterManager::getLocalPos(int crSubSample, int pos)
+{
+    const bool useFallback = (pos == -1) || (pos <= -513);
+    const int  base = useFallback ? (128 << crSubSample) - 128 : pos;
+
+    // + 128 makes the value relative to the ideal left edge
+    return (base + 128) >> crSubSample;
+}
+
+// Builds an empty slice: all scalar members are zero and every plane
+// descriptor is cleared, with no line-pointer array attached.
+ScalerSlice::ScalerSlice()
+    : m_width(0)
+    , m_hCrSubSample(0)
+    , m_vCrSubSample(0)
+    , m_isRing(0)
+    , m_destroyLines(0)
+{
+    for (ScalerPlane* plane = m_plane; plane != m_plane + m_numSlicePlane; ++plane)
+    {
+        plane->lineBuf = NULL;
+        plane->sliceHor = 0;
+        plane->sliceVer = 0;
+        plane->availLines = 0;
+    }
+}
+
+// Releases everything the slice owns: the per-line buffers (only when
+// createLines() allocated them, as recorded in m_destroyLines) and the
+// line-pointer array of every plane.
+// Each freed pointer is reset to NULL so the function is safe to call more
+// than once; without the reset an explicit destroy() followed by the
+// destructor (which calls destroy() again) would free the arrays twice.
+void ScalerSlice::destroy()
+{
+    if (m_destroyLines)
+        destroyLines();
+    for (int i = 0; i < m_numSlicePlane; i++)
+    {
+        if (m_plane[i].lineBuf)
+        {
+            X265_FREE(m_plane[i].lineBuf);
+            m_plane[i].lineBuf = NULL;
+        }
+    }
+}
+
+// Allocates the line-pointer array of each of the four planes.
+//   lumLines:     number of lines for planes 0 and 3.
+//   crLines:      number of lines for planes 1 and 2.
+//   h_sub_sample: stored in m_hCrSubSample.
+//   v_sub_sample: stored in m_vCrSubSample.
+//   ring:         non-zero marks the slice as a ring buffer; its pointer
+//                 arrays then get three times as many entries.
+// Returns 0 on success, -1 when an allocation fails. Arrays allocated before
+// the failure stay attached to the slice and are released by destroy().
+int ScalerSlice::create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring)
+{
+    int i;
+    int size[4] = { lumLines, crLines, crLines, lumLines };
+
+    m_hCrSubSample = h_sub_sample;
+    m_vCrSubSample = v_sub_sample;
+    m_isRing = ring;
+    m_destroyLines = 0;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int n = size[i] * (ring == 0 ? 1 : 3);
+        m_plane[i].lineBuf = X265_MALLOC(uint8_t*, n);
+        if (!m_plane[i].lineBuf)
+            return -1;
+
+        // The allocator is not guaranteed to return zeroed memory. Clear the
+        // pointers so that destroyLines() never frees an entry that was not
+        // allocated, e.g. when createLines() fails part-way through.
+        memset(m_plane[i].lineBuf, 0, sizeof(uint8_t*) * n);
+
+        m_plane[i].availLines = size[i];
+        m_plane[i].sliceVer = 0;
+        m_plane[i].sliceHor = 0;
+    }
+    return 0;
+}
+
+/*
+ * Allocates the sample buffers behind the line pointers of the slice.
+ *   size:  allocated memory size of one line; lines carry extra bytes for
+ *          vectorized code, so this is not the pixel count.
+ *   width: number of pixels per line, stored in m_width.
+ * Planes 0 and 1 own one allocation of size * 2 + 32 bytes per line; planes
+ * 3 and 2 respectively point into that allocation at offset size + 16.
+ * For a ring slice every pointer is mirrored at index line + availLines.
+ * Returns 0 on success; on allocation failure the lines are released with
+ * destroyLines() and -1 is returned.
+ */
+int ScalerSlice::createLines(int size, int width)
+{
+    const int partner[2] = { 3, 2 };
+
+    m_width = width;
+    m_destroyLines = 1;
+
+    for (int p = 0; p < 2; ++p)
+    {
+        ScalerPlane& owner  = m_plane[p];
+        ScalerPlane& shared = m_plane[partner[p]];
+        const int lineCount = owner.availLines;
+        assert(lineCount == shared.availLines);
+
+        for (int line = 0; line < lineCount; ++line)
+        {
+            // chroma plane line U and V are expected to be contiguous in memory
+            uint8_t* buf = (uint8_t*)X265_MALLOC(uint8_t, size * 2 + 32);
+            owner.lineBuf[line] = buf;
+            if (!buf)
+            {
+                destroyLines();
+                return -1;
+            }
+
+            shared.lineBuf[line] = buf + size + 16;
+            if (m_isRing)
+            {
+                owner.lineBuf[line + lineCount]  = owner.lineBuf[line];
+                shared.lineBuf[line + lineCount] = shared.lineBuf[line];
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Frees the per-line buffers and clears every line pointer of the slice.
+// Only the pointers of planes 0 and 1 are passed to X265_FREE; afterwards the
+// pointer arrays of all four planes are zeroed and m_destroyLines is reset.
+void ScalerSlice::destroyLines()
+{
+    for (int p = 0; p < 2; ++p)
+    {
+        ScalerPlane& plane = m_plane[p];
+        const int lineCount = plane.availLines;
+        for (int line = 0; line < lineCount; ++line)
+        {
+            X265_FREE(plane.lineBuf[line]);
+            plane.lineBuf[line] = NULL;
+            if (m_isRing)
+                plane.lineBuf[line + lineCount] = NULL;
+        }
+    }
+
+    // ring slices have three pointer entries per available line
+    const int copies = m_isRing ? 3 : 1;
+    for (int p = 0; p < m_numSlicePlane; ++p)
+        memset(m_plane[p].lineBuf, 0, sizeof(uint8_t*) * m_plane[p].availLines * copies);
+
+    m_destroyLines = 0;
+}
+
+// Pre-fills the start of every line of every plane with a constant.
+//   n:       element count; halved when is16bit is set.
+//   is16bit: non-zero writes int32_t elements of value 1 << 18, zero writes
+//            int16_t elements of value 1 << 14.
+// One element more than the computed count is written to each line.
+void ScalerSlice::fillOnes(int n, int is16bit)
+{
+    // fill also one extra element
+    const int count = (is16bit ? n >> 1 : n) + 1;
+
+    for (int p = 0; p < m_numSlicePlane; ++p)
+    {
+        uint8_t** lines = m_plane[p].lineBuf;
+        const int lineCount = m_plane[p].availLines;
+
+        for (int line = 0; line < lineCount; ++line)
+        {
+            if (is16bit)
+            {
+                int32_t* out = (int32_t*)lines[line];
+                for (int k = 0; k < count; ++k)
+                    out[k] = 1 << 18;
+            }
+            else
+            {
+                int16_t* out = (int16_t*)lines[line];
+                for (int k = 0; k < count; ++k)
+                    out[k] = 1 << 14;
+            }
+        }
+    }
+}
+
+// Advances the window of a plane by availLines once the given position lies
+// at least two buffer lengths past the plane's first line.
+//   lum: position checked against planes 0 and 3; 0 skips those planes.
+//   cr:  position checked against planes 1 and 2; 0 skips those planes.
+// Always returns 0.
+int ScalerSlice::rotate(int lum, int cr)
+{
+    if (lum)
+    {
+        for (int p = 0; p < m_numSlicePlane; p += 3)
+        {
+            ScalerPlane& plane = m_plane[p];
+            const int n = plane.availLines;
+            if (lum - plane.sliceVer < n * 2)
+                continue;
+            plane.sliceVer += n;
+            plane.sliceHor -= n;
+        }
+    }
+
+    if (cr)
+    {
+        for (int p = 1; p < 3; ++p)
+        {
+            ScalerPlane& plane = m_plane[p];
+            const int n = plane.availLines;
+            if (cr - plane.sliceVer < n * 2)
+                continue;
+            plane.sliceVer += n;
+            plane.sliceHor -= n;
+        }
+    }
+
+    return 0;
+}
+
+// Points the line pointers of the slice at rows of caller-owned plane
+// buffers; no sample data is copied.
+//   src / stride: base pointer and row stride (in bytes) of the four planes.
+//   srcW:         stored as the slice line width (m_width).
+//   lumY / lumH:  first line and number of lines for planes 0 and 3.
+//   crY / crH:    first line and number of lines for planes 1 and 2.
+//   relative:     non-zero when src[] already addresses the first line of the
+//                 range; zero when the start-line offset must be added here.
+// Always returns 0.
+// NOTE(review): all four src pointers are offset unconditionally - confirm
+// that callers pass usable pointers and strides for planes they do not use.
+int ScalerSlice::initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative)
+{
+    int i = 0;
+
+    // first line of the incoming range, per plane
+    const int start[m_numSlicePlane] = { lumY, crY, crY, lumY };
+
+    // one past the last line of the incoming range, per plane
+    const int end[m_numSlicePlane] = { lumY + lumH, crY + crH, crY + crH, lumY + lumH };
+
+    // address of the first incoming line, per plane
+    uint8_t *const src_[m_numSlicePlane] = { src[0] + (relative ? 0 : start[0]) * stride[0],
+        src[1] + (relative ? 0 : start[1]) * stride[1],
+        src[2] + (relative ? 0 : start[2]) * stride[2],
+        src[3] + (relative ? 0 : start[3]) * stride[3] };
+
+    m_width = srcW;
+
+    for (i = 0; i < m_numSlicePlane; ++i)
+    {
+        int j;
+        int first = m_plane[i].sliceVer;
+        int n = m_plane[i].availLines;
+        int lines = end[i] - start[i];
+        int tot_lines = end[i] - first;
+
+        // The range starts at or after the current first line and the span
+        // from that first line to the end of the range fits in the plane:
+        // keep the window and place the new lines relative to its first line.
+        if (start[i] >= first && n >= tot_lines)
+        {
+            m_plane[i].sliceHor = x265_max(tot_lines, m_plane[i].sliceHor);
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[start[i] - first + j] = src_[i] + j * stride[i];
+        }
+        // Otherwise restart the window at the incoming start line and keep at
+        // most availLines lines of the range.
+        else
+        {
+            m_plane[i].sliceVer = start[i];
+            lines = lines > n ? n : lines;
+            m_plane[i].sliceHor = lines;
+            for (j = 0; j < lines; j += 1)
+                m_plane[i].lineBuf[j] = src_[i] + j * stride[i];
+        }
+    }
+    return 0;
+}
+}
diff -r 00b686782ad0 -r 96739081845a source/common/scaler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/scaler.h	Mon Aug 26 15:34:49 2019 +0530
@@ -0,0 +1,254 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Pooja Venkatesan <pooja at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SCALER_H
+#define X265_SCALER_H
+
+#include "common.h"
+
+namespace X265_NS {
+//x265 private namespace
+
+// Forward declarations; both classes are defined later in this header.
+class ScalerSlice;
+class VideoDesc;
+
+// Extra lines added to the vertical filter length when sizing the ring buffer.
+#define MAX_NUM_LINES_AHEAD 4
+// Rounds x up to a multiple of j; the bit mask only works for power-of-two j.
+#define SCALER_ALIGN(x, j) (((x)+(j)-1)&~((j)-1))
+// Absolute value. The argument is evaluated more than once.
+#define X265_ABS(j) ((j) >= 0 ? (j) : (-(j)))
+// NOTE(review): not used in the visible code - confirm meaning at the use site.
+#define SCALER_MAX_REDUCE_CUTOFF 0.002
+// NOTE(review): flag bit, not used in the visible code - confirm at the use site.
+#define SCALER_BITEXACT  0x80000
+// i / j with half of j added to (i > 0) or subtracted from (i <= 0) the
+// numerator first. Arguments are evaluated more than once.
+#define ROUNDED_DIVISION(i,j) (((i)>0 ? (i) + ((j)>>1) : (i) - ((j)>>1))/(j))
+// Right shift of i by j that rounds up instead of down; the form is chosen
+// depending on whether j is a compile-time constant.
+#define UH_CEIL_SHIFTR(i,j) (!scale_builtin_constant_p(j) ? -((-(i)) >> (j)) \
+                                                          : ((i) + (1<<(j)) - 1) >> (j))
+
+// Compile-time-constant test: the compiler builtin on GCC/Clang, always 0
+// elsewhere.
+#if defined(__GNUC__) || defined(__clang__)
+#    define scale_builtin_constant_p __builtin_constant_p
+#else
+#    define scale_builtin_constant_p(x) 0
+#endif
+
+// Consecutive indices starting at 0; NUM_RES_FACTOR is the number of entries.
+// NOTE(review): the names suggest factors 64/32/16/8/4 plus a default entry,
+// but no user of this enum is visible here - confirm at the use site.
+enum ResFactor
+{
+    RES_FACTOR_64, RES_FACTOR_32, RES_FACTOR_16, RES_FACTOR_8,
+    RES_FACTOR_4, RES_FACTOR_DEF, NUM_RES_FACTOR
+};
+
+// Consecutive indices starting at 0; NUM_FACTOR is the number of entries.
+// NOTE(review): no user of this enum is visible here - confirm at the use site.
+enum ScalerFactor
+{
+    FACTOR_4, FACTOR_8, NUM_FACTOR
+};
+
+// Consecutive indices starting at 0; NUM_FIL is the number of entries.
+// NOTE(review): the names suggest one index per filter length plus a default
+// entry, but no user of this enum is visible here - confirm at the use site.
+enum FilterSize
+{
+    FIL_4, FIL_6, FIL_8, FIL_9, FIL_10, FIL_11, FIL_13, FIL_15,
+    FIL_16, FIL_17, FIL_19, FIL_22, FIL_24, FIL_DEF, NUM_FIL
+};
+
+// Abstract base class of one scaling stage. It holds the filter description
+// (length, start positions, coefficients) and the slices the stage reads
+// from and writes to; subclasses implement process().
+class ScalerFilter {
+public:
+    int             m_filtLen;      // Filter length; presumably coefficients per destination sample - TODO confirm.
+    int32_t*        m_filtPos;      // Array of horizontal/vertical starting pos for each dst for luma / chroma planes.
+    int16_t*        m_filt;         // Array of horizontal/vertical filter coefficients for luma / chroma planes.
+    ScalerSlice*    m_sourceSlice;  // Source slice
+    ScalerSlice*    m_destSlice;    // Output slice
+    ScalerFilter();
+    virtual ~ScalerFilter();
+    // Runs the stage; callers in scaler.cpp pass a first line and a line count.
+    virtual void process(int sliceVer, int sliceHor) = 0;
+    // Defined outside this header. NOTE(review): presumably fills m_filt and
+    // m_filtPos - confirm against the definition.
+    int initCoeff(int flag, int inc, int srcW, int dstW, int filtAlign, int one, int sourcePos, int destPos);
+    // Attaches the slices the stage reads from and writes to; no ownership is taken here.
+    void setSlice(ScalerSlice* source, ScalerSlice* dest) { m_sourceSlice = source; m_destSlice = dest; }
+};
+
+// Plain description of a picture format: dimensions, colour space and input
+// bit depth. All members are public and set once by the constructor.
+class VideoDesc {
+public:
+    int         m_width;
+    int         m_height;
+    int         m_csp;
+    int         m_inputDepth;
+
+    // w, h: picture width and height; csp: colour space identifier;
+    // bitDepth: stored as m_inputDepth.
+    VideoDesc(int w, int h, int csp, int bitDepth)
+        : m_width(w)
+        , m_height(h)
+        , m_csp(csp)
+        , m_inputDepth(bitDepth)
+    {
+    }
+};
+
+// Bookkeeping for one plane of a ScalerSlice: an array of line pointers plus
+// the window of picture lines currently held in it.
+typedef struct ScalerPlane
+{
+    int       availLines; // max number of lines that can be held by this plane
+    int       sliceVer;   // index of first line
+    int       sliceHor;   // number of lines
+    uint8_t** lineBuf;    // line buffer (array of pointers, one per line)
+} ScalerPlane;
+
+// Base class of the horizontal filtering helpers; concrete subclasses
+// implement doScaling() for one bit depth.
+class HFilterScaler {
+public:
+    int m_bitDepth; // 0 in the base class, set by each subclass
+
+    HFilterScaler() : m_bitDepth(0) {}
+    virtual ~HFilterScaler() {}
+
+    // Horizontal scaling of one line: dst receives dstW samples computed from
+    // src using filterSize coefficients per sample (filter) and the per-sample
+    // source start positions (filterPos).
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize) = 0;
+};
+
+// Base class of the vertical filtering helpers; concrete subclasses
+// implement yuv2PlaneX() for one bit depth.
+class VFilterScaler {
+public:
+    int m_bitDepth; // 0 in the base class, set by each subclass
+
+    VFilterScaler() : m_bitDepth(0) {}
+    virtual ~VFilterScaler() {}
+
+    // Vertical filtering of one output line: dest receives dstW samples built
+    // from filterSize source lines (src) weighted by filter.
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW) = 0;
+};
+
+// Horizontal filtering helper for the 8 bit case.
+class HFilterScaler8Bit : public HFilterScaler {
+public:
+    HFilterScaler8Bit()
+    {
+        m_bitDepth = 8;
+    }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Horizontal filtering helper for the 10 bit case.
+class HFilterScaler10Bit : public HFilterScaler {
+public:
+    HFilterScaler10Bit()
+    {
+        m_bitDepth = 10;
+    }
+    virtual void doScaling(int16_t *dst, int dstW, const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize);
+};
+
+// Vertical filtering helper for the 8 bit case.
+class VFilterScaler8Bit : public VFilterScaler {
+public:
+    VFilterScaler8Bit()
+    {
+        m_bitDepth = 8;
+    }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Vertical filtering helper for the 10 bit case.
+class VFilterScaler10Bit : public VFilterScaler {
+public:
+    VFilterScaler10Bit()
+    {
+        m_bitDepth = 10;
+    }
+    virtual void yuv2PlaneX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW);
+};
+
+// Horizontal filter for luma.
+// Owns the bit-depth specific kernel selected in the constructor.
+class ScalerHLumFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when the bit depth is not supported
+public:
+    // bitDepth: 8 or 10 selects the matching kernel; any other value leaves
+    // the kernel pointer NULL instead of uninitialized.
+    ScalerHLumFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it must be released with delete
+    // (not X265_FREE); deleting a NULL pointer is a no-op.
+    ~ScalerHLumFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Horizontal filter for chroma.
+// Owns the bit-depth specific kernel selected in the constructor.
+class ScalerHCrFilter : public ScalerFilter {
+private:
+    HFilterScaler* m_hFilterScaler; // NULL when the bit depth is not supported
+public:
+    // bitDepth: 8 or 10 selects the matching kernel; any other value leaves
+    // the kernel pointer NULL instead of uninitialized.
+    ScalerHCrFilter(int bitDepth) : m_hFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_hFilterScaler = new HFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_hFilterScaler = new HFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it must be released with delete
+    // (not X265_FREE); deleting a NULL pointer is a no-op.
+    ~ScalerHCrFilter() { delete m_hFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for luma.
+// Owns the bit-depth specific kernel selected in the constructor.
+class ScalerVLumFilter : public ScalerFilter {
+private:
+    VFilterScaler* m_vFilterScaler; // NULL when the bit depth is not supported
+public:
+    // bitDepth: 8 or 10 selects the matching kernel; any other value leaves
+    // the kernel pointer NULL instead of uninitialized.
+    ScalerVLumFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it must be released with delete
+    // (not X265_FREE); deleting a NULL pointer is a no-op.
+    ~ScalerVLumFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// Vertical filter for chroma.
+// Owns the bit-depth specific kernel selected in the constructor.
+class ScalerVCrFilter : public ScalerFilter {
+private:
+    VFilterScaler*    m_vFilterScaler; // NULL when the bit depth is not supported
+public:
+    // bitDepth: 8 or 10 selects the matching kernel; any other value leaves
+    // the kernel pointer NULL instead of uninitialized.
+    ScalerVCrFilter(int bitDepth) : m_vFilterScaler(NULL)
+    {
+        if (bitDepth == 8)
+            m_vFilterScaler = new VFilterScaler8Bit;
+        else if (bitDepth == 10)
+            m_vFilterScaler = new VFilterScaler10Bit;
+    }
+    // The kernel is created with new, so it must be released with delete
+    // (not X265_FREE); deleting a NULL pointer is a no-op.
+    ~ScalerVCrFilter() { delete m_vFilterScaler; }
+    virtual void process(int sliceVer, int sliceHor);
+};
+
+// A set of four planes, each described by an array of line pointers
+// (ScalerPlane). The pointers either reference caller-owned picture rows
+// (initFromSrc) or buffers allocated by the slice itself (createLines).
+// NOTE(review): the class owns raw memory but declares no copy constructor
+// or assignment operator - copying an instance would share the pointers.
+class ScalerSlice
+{
+private:
+    enum ScalerSlicePlaneNum { m_numSlicePlane = 4 };
+public:
+    int m_width;        // Slice line width
+    int m_hCrSubSample; // horizontal Chroma subsampling factor
+    int m_vCrSubSample; // vertical chroma subsampling factor
+    int m_isRing;       // flag to identify if this ScalerSlice is a ring buffer
+    int m_destroyLines; // flag to identify if there are dynamic allocated lines
+    ScalerPlane m_plane[m_numSlicePlane];
+public:
+    ScalerSlice();
+    // Releases all memory owned by the slice via destroy().
+    ~ScalerSlice() { destroy(); }
+    // Advances plane windows by availLines once lum / cr is far enough ahead.
+    int rotate(int lum, int cr);
+    // Pre-fills the start of every line with a constant.
+    void fillOnes(int n, int is16bit);
+    // Allocates the line-pointer arrays; returns -1 on allocation failure.
+    int create(int lumLines, int crLines, int h_sub_sample, int v_sub_sample, int ring);
+    // Allocates the per-line buffers; returns -1 on allocation failure.
+    int createLines(int size, int width);
+    // Frees the buffers allocated by createLines().
+    void destroyLines();
+    // Frees the per-line buffers (if any) and the line-pointer arrays.
+    void destroy();
+    // Points the line pointers at rows of caller-owned plane buffers.
+    int initFromSrc(uint8_t *src[4], const int stride[4], int srcW, int lumY, int lumH, int crY, int crH, int relative);
+};
+
+// Owns the slices and filter stages of one scaling pipeline and drives them.
+// init() configures the pipeline for a source and destination format;
+// scale_pic() scales one picture.
+class ScalerFilterManager {
+private:
+    enum ScalerFilterNum { m_numSlice = 3, m_numFilter = 4 };
+
+private:
+    int                     m_bitDepth;
+    int                     m_algorithmFlags;  // 1, bilinear; 4 bicubic, default is bicubic
+    int                     m_srcW;            // Width  of source luma planes.
+    int                     m_srcH;            // Height of source luma planes.
+    int                     m_dstW;            // Width of dest luma planes.
+    int                     m_dstH;            // Height of dest luma planes.
+    int                     m_crSrcW;          // Width  of source chroma planes.
+    int                     m_crSrcH;          // Height of source chroma planes.
+    int                     m_crDstW;          // Width  of dest chroma planes.
+    int                     m_crDstH;          // Height of dest chroma planes.
+    int                     m_crSrcHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in src  image.
+    int                     m_crSrcVSubSample; // Binary log of vertical   subsampling factor between Y and Cr planes in src  image.
+    int                     m_crDstHSubSample; // Binary log of horizontal subsampling factor between Y and Cr planes in dest image.
+    int                     m_crDstVSubSample; // Binary log of vertical   subsampling factor between Y and Cr planes in dest image.
+    ScalerSlice*            m_slices[m_numSlice];         // owned; NULL until allocated
+    ScalerFilter*           m_ScalerFilters[m_numFilter]; // owned; NULL until allocated
+private:
+    int getLocalPos(int crSubSample, int pos);
+    void getMinBufferSize(int *out_lum_size, int *out_cr_size);
+    int initScalerSlice();
+public:
+    ScalerFilterManager();
+    // Releases the owned slices and filters.
+    ~ScalerFilterManager()
+    {
+        // ~ScalerSlice() already calls destroy(); calling destroy() here as
+        // well made the line-pointer arrays get freed twice. Deleting a NULL
+        // pointer is a no-op, so no explicit check is required.
+        for (int i = 0; i < m_numSlice; i++)
+        {
+            delete m_slices[i];
+            m_slices[i] = NULL;
+        }
+        for (int i = 0; i < m_numFilter; i++)
+        {
+            delete m_ScalerFilters[i];
+            m_ScalerFilters[i] = NULL;
+        }
+    }
+    // Configures the scaler. algorithmFlags selects the algorithm (see
+    // m_algorithmFlags); the descriptors give the source and destination formats.
+    int init(int algorithmFlags, VideoDesc* srcVideoDesc, VideoDesc* dstVideoDesc);
+    // Scales one picture from the src planes to the dst planes using the given strides.
+    int scale_pic(void** src, void** dst, int* srcStride, int* dstStride);
+};
+}
+
+#endif //ifndef X265_SCALER_H
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265_push-1.patch
Type: text/x-patch
Size: 49967 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20200331/ac6246a7/attachment-0001.bin>