[x265] [PATCH] SEA Motion Search Implementation

Vignesh V Menon vignesh at multicorewareinc.com
Tue Nov 29 14:02:55 CET 2016


Please ignore the previous patch.
PFA

Thanks
Vignesh


On Tue, Nov 29, 2016 at 3:12 PM, Vignesh V Menon <
vignesh at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
> # Date 1480313149 -19800
> #      Mon Nov 28 11:35:49 2016 +0530
> # Node ID f8d523976ed61cada53c579d8145a815d21d08ed
> # Parent  5d95fbd53ca31747498c4bd661fa24f6ffd5a070
> SEA motion search Implementation
>
> diff -r 5d95fbd53ca3 -r f8d523976ed6 doc/reST/cli.rst
> --- a/doc/reST/cli.rst Fri Nov 25 12:57:52 2016 +0530
> +++ b/doc/reST/cli.rst Mon Nov 28 11:35:49 2016 +0530
> @@ -964,13 +964,17 @@
>   encoder: a star-pattern search followed by an optional radix scan
>   followed by an optional star-search refinement. Full is an
>   exhaustive search; an order of magnitude slower than all other
> - searches but not much better than umh or star.
> + searches but not much better than umh or star. SEA is similar to
> + full search; a three-step motion search adopted from x264: DC
> + calculation, followed by ADS calculation, followed by SAD of the
> + passed motion vector candidates; hence it is faster than full search.
>
>   0. dia
>   1. hex **(default)**
>   2. umh
>   3. star
> - 4. full
> + 4. sea
> + 5. full
>
>  .. option:: --subme, -m <0..7>
>
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/common.h
> --- a/source/common/common.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/common.h Mon Nov 28 11:35:49 2016 +0530
> @@ -328,6 +328,8 @@
>
>  #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
>
> +#define INTEGRAL_PLANE_NUM          12 // 12 integral planes for 32x32,
> 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16, 8x32, 8x8, 4x16 and 4x4.
> +
>  namespace X265_NS {
>
>  enum { SAO_NUM_OFFSET = 4 };
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/framedata.cpp
> --- a/source/common/framedata.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/framedata.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -48,6 +48,12 @@
>      CHECKED_MALLOC_ZERO(m_cuStat, RCStatCU, sps.numCUsInFrame);
>      CHECKED_MALLOC(m_rowStat, RCStatRow, sps.numCuInHeight);
>      reinit(sps);
> +
> +    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +    {
> +        m_meBuffer[i] = NULL;
> +        m_meIntegral[i] = NULL;
> +    }
>      return true;
>
>  fail:
> @@ -70,4 +76,16 @@
>
>      X265_FREE(m_cuStat);
>      X265_FREE(m_rowStat);
> +
> +    if (m_meBuffer)
> +    {
> +        for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +        {
> +            if (m_meBuffer[i] != NULL)
> +            {
> +                X265_FREE(m_meBuffer[i]);
> +                m_meBuffer[i] = NULL;
> +            }
> +        }
> +    }
>  }
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/framedata.h
> --- a/source/common/framedata.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/framedata.h Mon Nov 28 11:35:49 2016 +0530
> @@ -151,6 +151,9 @@
>      double         m_rateFactor; /* calculated based on the Frame QP */
>      int            m_picCsp;
>
> +    uint32_t*              m_meIntegral[INTEGRAL_PLANE_NUM];       // 12
> integral planes for 32x32, 32x24, 32x8, 24x32, 16x16, 16x12, 16x4, 12x16,
> 8x32, 8x8, 4x16 and 4x4.
> +    uint32_t*              m_meBuffer[INTEGRAL_PLANE_NUM];
> +
>      FrameData();
>
>      bool create(const x265_param& param, const SPS& sps, int csp);
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/param.cpp
> --- a/source/common/param.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/param.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -1092,8 +1092,8 @@
>            "Frame rate numerator and denominator must be specified");
>      CHECK(param->interlaceMode < 0 || param->interlaceMode > 2,
>            "Interlace mode must be 0 (progressive) 1 (top-field first) or
> 2 (bottom field first)");
> -    CHECK(param->searchMethod<0 || param->searchMethod> X265_FULL_SEARCH,
> -          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM
> 5:FULL)");
> +    CHECK(param->searchMethod < 0 || param->searchMethod >
> X265_FULL_SEARCH,
> +          "Search method is not supported value (0:DIA 1:HEX 2:UMH 3:HM
> 4:SEA 5:FULL)");
>      CHECK(param->searchRange < 0,
>            "Search Range must be more than 0");
>      CHECK(param->searchRange >= 32768,
> @@ -1256,6 +1256,10 @@
>          "qpmin exceeds supported range (0 to 69)");
>      CHECK(param->log2MaxPocLsb < 4 || param->log2MaxPocLsb > 16,
>          "Supported range for log2MaxPocLsb is 4 to 16");
> +#if !X86_64
> +    CHECK(param->searchMethod == X265_SEA && (param->sourceWidth > 840 ||
> param->sourceHeight > 480),
> +        "SEA motion search does not support resolutions greater than 480p
> in 32 bit build");
> +#endif
>      return check_failed;
>  }
>
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/pixel.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -117,6 +117,52 @@
>      }
>  }
>
> +template<int lx, int ly>
> +int ads_x4(int encDC[4], uint32_t *sums, int delta, uint16_t *costMvX,
> int16_t *mvs, int width, int thresh)
> +{
> +    int nmv = 0;
> +    for (int16_t i = 0; i < width; i++, sums++)
> +    {
> +        int ads = abs(encDC[0] - long(sums[0]))
> +            + abs(encDC[1] - long(sums[lx >> 1]))
> +            + abs(encDC[2] - long(sums[delta]))
> +            + abs(encDC[3] - long(sums[delta + (lx >> 1)]))
> +            + costMvX[i];
> +        if (ads < thresh)
> +            mvs[nmv++] = i;
> +    }
> +    return nmv;
> +}
> +
> +template<int lx, int ly>
> +int ads_x2(int encDC[2], uint32_t *sums, int delta, uint16_t *costMvX,
> int16_t *mvs, int width, int thresh)
> +{
> +    int nmv = 0;
> +    for (int16_t i = 0; i < width; i++, sums++)
> +    {
> +        int ads = abs(encDC[0] - long(sums[0]))
> +            + abs(encDC[1] - long(sums[delta]))
> +            + costMvX[i];
> +        if (ads < thresh)
> +            mvs[nmv++] = i;
> +    }
> +    return nmv;
> +}
> +
> +template<int lx, int ly>
> +int ads_x1(int encDC[1], uint32_t *sums, int, uint16_t *costMvX, int16_t
> *mvs, int width, int thresh)
> +{
> +    int nmv = 0;
> +    for (int16_t i = 0; i < width; i++, sums++)
> +    {
> +        int ads = abs(encDC[0] - long(sums[0]))
> +            + costMvX[i];
> +        if (ads < thresh)
> +            mvs[nmv++] = i;
> +    }
> +    return nmv;
> +}
> +
>  template<int lx, int ly, class T1, class T2>
>  sse_t sse(const T1* pix1, intptr_t stride_pix1, const T2* pix2, intptr_t
> stride_pix2)
>  {
> @@ -991,6 +1037,32 @@
>      LUMA_PU(64, 16);
>      LUMA_PU(16, 64);
>
> +    p.pu[LUMA_4x4].ads = ads_x1<4, 4>;
> +    p.pu[LUMA_8x8].ads = ads_x1<8, 8>;
> +    p.pu[LUMA_8x4].ads = ads_x2<8, 4>;
> +    p.pu[LUMA_4x8].ads = ads_x2<4, 8>;
> +    p.pu[LUMA_16x16].ads = ads_x4<16, 16>;
> +    p.pu[LUMA_16x8].ads = ads_x2<16, 8>;
> +    p.pu[LUMA_8x16].ads = ads_x2<8, 16>;
> +    p.pu[LUMA_16x12].ads = ads_x1<16, 12>;
> +    p.pu[LUMA_12x16].ads = ads_x1<12, 16>;
> +    p.pu[LUMA_16x4].ads = ads_x1<16, 4>;
> +    p.pu[LUMA_4x16].ads = ads_x1<4, 16>;
> +    p.pu[LUMA_32x32].ads = ads_x4<32, 32>;
> +    p.pu[LUMA_32x16].ads = ads_x2<32, 16>;
> +    p.pu[LUMA_16x32].ads = ads_x2<16, 32>;
> +    p.pu[LUMA_32x24].ads = ads_x4<32, 24>;
> +    p.pu[LUMA_24x32].ads = ads_x4<24, 32>;
> +    p.pu[LUMA_32x8].ads = ads_x4<32, 8>;
> +    p.pu[LUMA_8x32].ads = ads_x4<8, 32>;
> +    p.pu[LUMA_64x64].ads = ads_x4<64, 64>;
> +    p.pu[LUMA_64x32].ads = ads_x2<64, 32>;
> +    p.pu[LUMA_32x64].ads = ads_x2<32, 64>;
> +    p.pu[LUMA_64x48].ads = ads_x4<64, 48>;
> +    p.pu[LUMA_48x64].ads = ads_x4<48, 64>;
> +    p.pu[LUMA_64x16].ads = ads_x4<64, 16>;
> +    p.pu[LUMA_16x64].ads = ads_x4<16, 64>;
> +
>      p.pu[LUMA_4x4].satd   = satd_4x4;
>      p.pu[LUMA_8x8].satd   = satd8<8, 8>;
>      p.pu[LUMA_8x4].satd   = satd_8x4;
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/primitives.h
> --- a/source/common/primitives.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/primitives.h Mon Nov 28 11:35:49 2016 +0530
> @@ -115,6 +115,7 @@
>  typedef sse_t (*pixel_sse_t)(const pixel* fenc, intptr_t fencstride,
> const pixel* fref, intptr_t frefstride); // fenc is aligned
>  typedef sse_t (*pixel_sse_ss_t)(const int16_t* fenc, intptr_t fencstride,
> const int16_t* fref, intptr_t frefstride);
>  typedef sse_t (*pixel_ssd_s_t)(const int16_t* fenc, intptr_t fencstride);
> +typedef int(*pixelcmp_ads_t)(int encDC[], uint32_t *sums, int delta,
> uint16_t *costMvX, int16_t *mvs, int width, int thresh);
>  typedef void (*pixelcmp_x4_t)(const pixel* fenc, const pixel* fref0,
> const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t
> frefstride, int32_t* res);
>  typedef void (*pixelcmp_x3_t)(const pixel* fenc, const pixel* fref0,
> const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
>  typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t
> val);
> @@ -217,6 +218,7 @@
>          pixelcmp_t     sad;         // Sum of Absolute Differences
>          pixelcmp_x3_t  sad_x3;      // Sum of Absolute Differences, 3 mv
> offsets at once
>          pixelcmp_x4_t  sad_x4;      // Sum of Absolute Differences, 4 mv
> offsets at once
> +        pixelcmp_ads_t ads;         // Absolute Differences sum
>          pixelcmp_t     satd;        // Sum of Absolute Transformed
> Differences (4x4 Hadamard)
>
>          filter_pp_t    luma_hpp;    // 8-tap luma motion compensation
> interpolation filters
> @@ -402,6 +404,22 @@
>      return part;
>  }
>
> +/* Computes the size of the LumaPU for a given LumaPU enum */
> +inline void sizesFromPartition(int part, int *width, int *height)
> +{
> +    X265_CHECK(part >= 0 && part <= 24, "Invalid part %d \n", part);
> +    extern const uint8_t lumaPartitionMapTable[];
> +    int index = 0;
> +    for (int i = 0; i < 256;i++)
> +        if (part == lumaPartitionMapTable[i])
> +        {
> +            index = i;
> +            break;
> +        }
> +    *width = 4 * ((index >> 4) + 1);
> +    *height = 4 * ((index % 16) + 1);
> +}
> +
>  inline int partitionFromLog2Size(int log2Size)
>  {
>      X265_CHECK(2 <= log2Size && log2Size <= 6, "Invalid block size\n");
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/yuv.cpp
> --- a/source/common/yuv.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/yuv.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -47,6 +47,11 @@
>      m_size  = size;
>      m_part = partitionFromSizes(size, size);
>
> +    for (int i = 0; i < 2; i++)
> +        for (int j = 0; j < MAX_NUM_REF; j++)
> +            for (int k = 0; k < INTEGRAL_PLANE_NUM; k++)
> +                m_integral[i][j][k] = NULL;
> +
>      if (csp == X265_CSP_I400)
>      {
>          CHECKED_MALLOC(m_buf[0], pixel, size * size + 8);
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/common/yuv.h
> --- a/source/common/yuv.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/common/yuv.h Mon Nov 28 11:35:49 2016 +0530
> @@ -48,6 +48,7 @@
>      int      m_csp;
>      int      m_hChromaShift;
>      int      m_vChromaShift;
> +    uint32_t *m_integral[2][MAX_NUM_REF][INTEGRAL_PLANE_NUM];
>
>      Yuv();
>
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/analysis.cpp
> --- a/source/encoder/analysis.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/analysis.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -943,6 +943,16 @@
>      ModeDepth& md = m_modeDepth[depth];
>      md.bestMode = NULL;
>
> +    if (m_param->searchMethod == X265_SEA)
> +    {
> +        int numPredDir = m_slice->isInterP() ? 1 : 2;
> +        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr]
> + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
> +        for (int list = 0; list < numPredDir; list++)
> +            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list];
> i++)
> +                for (int planes = 0; planes < INTEGRAL_PLANE_NUM;
> planes++)
> +                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes]
> = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes]
> + offset;
> +    }
> +
>      PicYuv& reconPic = *m_frame->m_reconPic;
>
>      bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
> @@ -1484,6 +1494,16 @@
>      ModeDepth& md = m_modeDepth[depth];
>      md.bestMode = NULL;
>
> +    if (m_param->searchMethod == X265_SEA)
> +    {
> +        int numPredDir = m_slice->isInterP() ? 1 : 2;
> +        int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr]
> + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);
> +        for (int list = 0; list < numPredDir; list++)
> +            for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list];
> i++)
> +                for (int planes = 0; planes < INTEGRAL_PLANE_NUM;
> planes++)
> +                    m_modeDepth[depth].fencYuv.m_integral[list][i][planes]
> = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes]
> + offset;
> +    }
> +
>      bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);
>      bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);
>      bool skipRecursion = false;
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/bitcost.cpp
> --- a/source/encoder/bitcost.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/bitcost.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -54,7 +54,22 @@
>                  s_costs[qp][i] = s_costs[qp][-i] =
> (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
>          }
>      }
> +    for (int j = 0; j < 4; j++)
> +    {
> +        if (!s_fpelMvCosts[qp][j])
> +        {
> +            s_fpelMvCosts[qp][j] = X265_MALLOC(uint16_t, BC_MAX_MV + 1) +
> (BC_MAX_MV >> 1);
> +        }
> +    }
>
> +    for (int j = 0; j < 4; j++)
> +    {
> +        for (int i = -(BC_MAX_MV >> 1); i < (BC_MAX_MV >> 1); i++)
> +        {
> +            s_fpelMvCosts[qp][j][i] = s_costs[qp][i * 4 + j];
> +        }
> +        m_fpelMvCosts[j] = s_fpelMvCosts[qp][j];
> +    }
>      m_cost = s_costs[qp];
>  }
>
> @@ -64,6 +79,8 @@
>
>  uint16_t *BitCost::s_costs[BC_MAX_QP];
>
> +uint16_t* BitCost::s_fpelMvCosts[BC_MAX_QP][4];
> +
>  float *BitCost::s_bitsizes;
>
>  Lock BitCost::s_costCalcLock;
> @@ -97,6 +114,17 @@
>          }
>      }
>
> +    for (int i = 0; i < BC_MAX_QP; i++)
> +    {
> +        if (s_fpelMvCosts[i][0])
> +        {
> +            for (int j = 0; j < 4; j++)
> +            {
> +                X265_FREE(s_fpelMvCosts[i][j] - (BC_MAX_MV >> 1));
> +            }
> +        }
> +    }
> +
>      if (s_bitsizes)
>      {
>          X265_FREE(s_bitsizes - 2 * BC_MAX_MV);
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/bitcost.h
> --- a/source/encoder/bitcost.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/bitcost.h Mon Nov 28 11:35:49 2016 +0530
> @@ -67,6 +67,8 @@
>
>      uint16_t *m_cost;
>
> +    uint16_t *m_fpelMvCosts[4];
> +
>      MV        m_mvp;
>
>      BitCost& operator =(const BitCost&);
> @@ -84,6 +86,8 @@
>
>      static uint16_t *s_costs[BC_MAX_QP];
>
> +    static uint16_t *s_fpelMvCosts[BC_MAX_QP][4];
> +
>      static Lock s_costCalcLock;
>
>      static void CalculateLogs();
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/dpb.cpp
> --- a/source/encoder/dpb.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/dpb.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -92,6 +92,19 @@
>              m_freeList.pushBack(*curFrame);
>              curFrame->m_encData->m_freeListNext = m_frameDataFreeList;
>              m_frameDataFreeList = curFrame->m_encData;
> +
> +            if (curFrame->m_encData->m_meBuffer)
> +            {
> +                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +                {
> +                    if (curFrame->m_encData->m_meBuffer[i] != NULL)
> +                    {
> +                        X265_FREE(curFrame->m_encData->m_meBuffer[i]);
> +                        curFrame->m_encData->m_meBuffer[i] = NULL;
> +                    }
> +                }
> +            }
> +
>              curFrame->m_encData = NULL;
>              curFrame->m_reconPic = NULL;
>          }
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/encoder.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -869,6 +869,25 @@
>                  slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame
> * NUM_4x4_PARTITIONS);
>              }
>
> +            if (m_param->searchMethod == X265_SEA &&
> frameEnc->m_lowres.sliceType != X265_TYPE_B)
> +            {
> +                int padX = g_maxCUSize + 32;
> +                int padY = g_maxCUSize + 16;
> +                uint32_t numCuInHeight = (frameEnc->m_encData->m_reconPic->m_picHeight
> + g_maxCUSize - 1) / g_maxCUSize;
> +                int maxHeight = numCuInHeight * g_maxCUSize;
> +                for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +                {
> +                    frameEnc->m_encData->m_meBuffer[i] =
> X265_MALLOC(uint32_t, frameEnc->m_reconPic->m_stride * (maxHeight + (2 *
> padY)));
> +                    if (frameEnc->m_encData->m_meBuffer[i])
> +                    {
> +                        memset(frameEnc->m_encData->m_meBuffer[i], 0,
> sizeof(uint32_t)* frameEnc->m_reconPic->m_stride * (maxHeight + (2 *
> padY)));
> +                        frameEnc->m_encData->m_meIntegral[i] =
> frameEnc->m_encData->m_meBuffer[i] + frameEnc->m_encData->m_reconPic->m_stride
> * padY + padX;
> +                    }
> +                    else
> +                        x265_log(m_param, X265_LOG_ERROR, "SEA motion
> search: POC %d Integral buffer[%d] unallocated\n", frameEnc->m_poc, i);
> +                }
> +            }
> +
>              if (m_param->bOptQpPPS && frameEnc->m_lowres.bKeyframe &&
> m_param->bRepeatHeaders)
>              {
>                  ScopedLock qpLock(m_sliceQpLock);
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/framefilter.cpp
> --- a/source/encoder/framefilter.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/framefilter.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -35,6 +35,109 @@
>  static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride,
> uint32_t width, uint32_t height);
>  static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2,
> intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t&
> cnt);
>
> +static void integral_init4h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3];
> +    for (int16_t x = 0; x < stride - 4; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 4] - pix[x];
> +    }
> +}
> +
> +static void integral_init8h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] +
> pix[6] + pix[7];
> +    for (int16_t x = 0; x < stride - 8; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 8] - pix[x];
> +    }
> +}
> +
> +static void integral_init12h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] +
> pix[6] + pix[7] +
> +        pix[8] + pix[9] + pix[10] + pix[11];
> +    for (int16_t x = 0; x < stride - 12; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 12] - pix[x];
> +    }
> +}
> +
> +static void integral_init16h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] +
> pix[6] + pix[7] +
> +        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14]
> + pix[15];
> +    for (int16_t x = 0; x < stride - 16; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 16] - pix[x];
> +    }
> +}
> +
> +static void integral_init24h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] +
> pix[6] + pix[7] +
> +        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14]
> + pix[15] +
> +        pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] +
> pix[22] + pix[23];
> +    for (int16_t x = 0; x < stride - 24; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 24] - pix[x];
> +    }
> +}
> +
> +static void integral_init32h(uint32_t *sum, pixel *pix, intptr_t stride)
> +{
> +    int32_t v = pix[0] + pix[1] + pix[2] + pix[3] + pix[4] + pix[5] +
> pix[6] + pix[7] +
> +        pix[8] + pix[9] + pix[10] + pix[11] + pix[12] + pix[13] + pix[14]
> + pix[15] +
> +        pix[16] + pix[17] + pix[18] + pix[19] + pix[20] + pix[21] +
> pix[22] + pix[23] +
> +        pix[24] + pix[25] + pix[26] + pix[27] + pix[28] + pix[29] +
> pix[30] + pix[31];
> +    for (int16_t x = 0; x < stride - 32; x++)
> +    {
> +        sum[x] = v + sum[x - stride];
> +        v += pix[x + 32] - pix[x];
> +    }
> +}
> +
> +static void integral_init4v(uint32_t *sum4, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum4[x] = sum4[x + 4 * stride] - sum4[x];
> +}
> +
> +static void integral_init8v(uint32_t *sum8, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum8[x] = sum8[x + 8 * stride] - sum8[x];
> +}
> +
> +static void integral_init12v(uint32_t *sum12, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum12[x] = sum12[x + 12 * stride] - sum12[x];
> +}
> +
> +static void integral_init16v(uint32_t *sum16, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum16[x] = sum16[x + 16 * stride] - sum16[x];
> +}
> +
> +static void integral_init24v(uint32_t *sum24, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum24[x] = sum24[x + 24 * stride] - sum24[x];
> +}
> +
> +static void integral_init32v(uint32_t *sum32, intptr_t stride)
> +{
> +    for (int x = 0; x < stride; x++)
> +        sum32[x] = sum32[x + 32 * stride] - sum32[x];
> +}
> +
>  void FrameFilter::destroy()
>  {
>      X265_FREE(m_ssimBuf);
> @@ -65,6 +168,7 @@
>      m_saoRowDelay = m_param->bEnableLoopFilter ? 1 : 0;
>      m_lastHeight = (m_param->sourceHeight % g_maxCUSize) ?
> (m_param->sourceHeight % g_maxCUSize) : g_maxCUSize;
>      m_lastWidth = (m_param->sourceWidth % g_maxCUSize) ?
> (m_param->sourceWidth % g_maxCUSize) : g_maxCUSize;
> +    integralCompleted.set(0);
>
>      if (m_param->bEnableSsim)
>          m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
> @@ -664,6 +768,107 @@
>          }
>      } // end of (m_param->maxSlices == 1)
>
> +    int lastRow = row == (int)m_frame->m_encData->m_slice->m_sps->numCuInHeight
> - 1;
> +
> +    /* generate integral planes for SEA motion search */
> +    if (m_param->searchMethod == X265_SEA && m_frame->m_encData->m_meIntegral
> && m_frame->m_lowres.sliceType != X265_TYPE_B)
> +    {
> +        /* If WPP, other than first row, integral calculation for current
> row needs to wait till the
> +        * integral for the previous row is computed */
> +        if (m_param->bEnableWavefront && row)
> +        {
> +            while (m_parallelFilter[row - 1].m_frameFilter->integralCompleted.get()
> == 0)
> +            {
> +                m_parallelFilter[row - 1].m_frameFilter->
> integralCompleted.waitForChange(0);
> +            }
> +        }
> +
> +        int stride = (int)m_frame->m_reconPic->m_stride;
> +        int padX = g_maxCUSize + 32;
> +        int padY = g_maxCUSize + 16;
> +        int numCuInHeight = m_frame->m_encData->m_slice->
> m_sps->numCuInHeight;
> +        int maxHeight = numCuInHeight * g_maxCUSize;
> +        int start = 0;
> +
> +        if (m_param->interlaceMode)
> +            start = (row * g_maxCUSize >> 1);
> +        else
> +            start = row * g_maxCUSize;
> +
> +        int height = lastRow ? (maxHeight + g_maxCUSize *
> m_param->interlaceMode) : (((row + m_param->interlaceMode) * g_maxCUSize) +
> g_maxCUSize);
> +
> +        if (!row)
> +        {
> +            for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +                memset(m_frame->m_encData->m_meIntegral[i] - padY *
> stride - padX, 0, stride * sizeof(uint32_t));
> +            start = -padY;
> +        }
> +
> +        if (lastRow)
> +            height += padY - 1;
> +
> +        for (int y = start; y < height; y++)
> +        {
> +            pixel    *pix = m_frame->m_reconPic->m_picOrg[0] + y *
> stride - padX;
> +            uint32_t *sum32x32 = m_frame->m_encData->m_meIntegral[0] +
> (y + 1) * stride - padX;
> +            uint32_t *sum32x24 = m_frame->m_encData->m_meIntegral[1] +
> (y + 1) * stride - padX;
> +            uint32_t *sum32x8 = m_frame->m_encData->m_meIntegral[2] + (y
> + 1) * stride - padX;
> +            uint32_t *sum24x32 = m_frame->m_encData->m_meIntegral[3] +
> (y + 1) * stride - padX;
> +            uint32_t *sum16x16 = m_frame->m_encData->m_meIntegral[4] +
> (y + 1) * stride - padX;
> +            uint32_t *sum16x12 = m_frame->m_encData->m_meIntegral[5] +
> (y + 1) * stride - padX;
> +            uint32_t *sum16x4 = m_frame->m_encData->m_meIntegral[6] + (y
> + 1) * stride - padX;
> +            uint32_t *sum12x16 = m_frame->m_encData->m_meIntegral[7] +
> (y + 1) * stride - padX;
> +            uint32_t *sum8x32 = m_frame->m_encData->m_meIntegral[8] + (y
> + 1) * stride - padX;
> +            uint32_t *sum8x8 = m_frame->m_encData->m_meIntegral[9] + (y
> + 1) * stride - padX;
> +            uint32_t *sum4x16 = m_frame->m_encData->m_meIntegral[10] +
> (y + 1) * stride - padX;
> +            uint32_t *sum4x4 = m_frame->m_encData->m_meIntegral[11] + (y
> + 1) * stride - padX;
> +
> +            /*For width = 32 */
> +            integral_init32h(sum32x32, pix, stride);
> +            if (y >= 32 - padY)
> +                integral_init32v(sum32x32 - 32 * stride, stride);
> +            integral_init32h(sum32x24, pix, stride);
> +            if (y >= 24 - padY)
> +                integral_init24v(sum32x24 - 24 * stride, stride);
> +            integral_init32h(sum32x8, pix, stride);
> +            if (y >= 8 - padY)
> +                integral_init8v(sum32x8 - 8 * stride, stride);
> +            /*For width = 24 */
> +            integral_init24h(sum24x32, pix, stride);
> +            if (y >= 32 - padY)
> +                integral_init32v(sum24x32 - 32 * stride, stride);
> +            /*For width = 16 */
> +            integral_init16h(sum16x16, pix, stride);
> +            if (y >= 16 - padY)
> +                integral_init16v(sum16x16 - 16 * stride, stride);
> +            integral_init16h(sum16x12, pix, stride);
> +            if (y >= 12 - padY)
> +                integral_init12v(sum16x12 - 12 * stride, stride);
> +            integral_init16h(sum16x4, pix, stride);
> +            if (y >= 4 - padY)
> +                integral_init4v(sum16x4 - 4 * stride, stride);
> +            /*For width = 12 */
> +            integral_init12h(sum12x16, pix, stride);
> +            if (y >= 16 - padY)
> +                integral_init16v(sum12x16 - 16 * stride, stride);
> +            /*For width = 8 */
> +            integral_init8h(sum8x32, pix, stride);
> +            if (y >= 32 - padY)
> +                integral_init32v(sum8x32 - 32 * stride, stride);
> +            integral_init8h(sum8x8, pix, stride);
> +            if (y >= 8 - padY)
> +                integral_init8v(sum8x8 - 8 * stride, stride);
> +            /*For width = 4 */
> +            integral_init4h(sum4x16, pix, stride);
> +            if (y >= 16 - padY)
> +                integral_init16v(sum4x16 - 16 * stride, stride);
> +            integral_init4h(sum4x4, pix, stride);
> +            if (y >= 4 - padY)
> +                integral_init4v(sum4x4 - 4 * stride, stride);
> +        }
> +        m_parallelFilter[row].m_frameFilter->integralCompleted.set(1);
> +    }
> +
>      if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 *
> (int)m_frameEncoder->m_numRows)
>      {
>          m_frameEncoder->m_completionEvent.trigger();
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/framefilter.h
> --- a/source/encoder/framefilter.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/framefilter.h Mon Nov 28 11:35:49 2016 +0530
> @@ -57,6 +57,8 @@
>      int           m_lastHeight;
>      int           m_lastWidth;
>
> +    ThreadSafeInteger integralCompleted;     /* check if integral
> calculation is completed in this row */
> +
>      void*         m_ssimBuf;        /* Temp storage for ssim computation
> */
>
>  #define MAX_PFILTER_CUS     (4) /* maximum CUs for every thread */
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/motion.cpp
> --- a/source/encoder/motion.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/motion.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -109,6 +109,8 @@
>      blockOffset = 0;
>      bChromaSATD = false;
>      chromaSatd = NULL;
> +    for (int i = 0; i < INTEGRAL_PLANE_NUM; i++)
> +        integral[i] = NULL;
>  }
>
>  void MotionEstimate::init(int csp)
> @@ -165,10 +167,12 @@
>      partEnum = partitionFromSizes(pwidth, pheight);
>      X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
>      sad = primitives.pu[partEnum].sad;
> +    ads = primitives.pu[partEnum].ads;
>      satd = primitives.pu[partEnum].satd;
>      sad_x3 = primitives.pu[partEnum].sad_x3;
>      sad_x4 = primitives.pu[partEnum].sad_x4;
>
> +
>      blockwidth = pwidth;
>      blockOffset = offset;
>      absPartIdx = ctuAddr = -1;
> @@ -188,6 +192,7 @@
>      partEnum = partitionFromSizes(pwidth, pheight);
>      X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
>      sad = primitives.pu[partEnum].sad;
> +    ads = primitives.pu[partEnum].ads;
>      satd = primitives.pu[partEnum].satd;
>      sad_x3 = primitives.pu[partEnum].sad_x3;
>      sad_x4 = primitives.pu[partEnum].sad_x4;
> @@ -288,6 +293,21 @@
>              COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
>      }
>
> +#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
> +{\
> +    sad_x3(fenc, \
> +    fref + (m0x) + (m0y) * stride, \
> +    fref + (m1x) + (m1y) * stride, \
> +    fref + (m2x) + (m2y) * stride, \
> +    stride, costs); \
> +    costs[0] += p_cost_mvx[(m0x) << 2]; /* no cost_mvy */\
> +    costs[1] += p_cost_mvx[(m1x) << 2]; \
> +    costs[2] += p_cost_mvx[(m2x) << 2]; \
> +    COPY3_IF_LT(bcost, costs[0], bmv.x, m0x, bmv.y, m0y); \
> +    COPY3_IF_LT(bcost, costs[1], bmv.x, m1x, bmv.y, m1y); \
> +    COPY3_IF_LT(bcost, costs[2], bmv.x, m2x, bmv.y, m2y); \
> +}
> +
>  #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
>      { \
>          pixel *pix_base = fref + bmv.x + bmv.y * stride; \
> @@ -1078,6 +1098,161 @@
>          break;
>      }
>
> +    case X265_SEA:
> +    {
> +        // Successive Elimination Algorithm
> +        const int16_t minX = X265_MAX(omv.x - (int16_t)merange, mvmin.x);
> +        const int16_t minY = X265_MAX(omv.y - (int16_t)merange, mvmin.y);
> +        const int16_t maxX = X265_MIN(omv.x + (int16_t)merange, mvmax.x);
> +        const int16_t maxY = X265_MIN(omv.y + (int16_t)merange, mvmax.y);
> +        const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;
> +        const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;
> +        int16_t* meScratchBuffer = NULL;
> +        int scratchSize = merange * 2 + 4;
> +        if (scratchSize)
> +        {
> +            meScratchBuffer = X265_MALLOC(int16_t, scratchSize);
> +            memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);
> +        }
> +
> +        /* SEA is fastest in multiples of 4 */
> +        int meRangeWidth = (maxX - minX + 3) & ~3;
> +        int w = 0, h = 0;                    // Width and height of the PU
> +        ALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };
> +        ALIGN_VAR_32(int, encDC[4]);
> +        uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >>
> 2);
> +        sizesFromPartition(partEnum, &w, &h);
> +        int deltaX = (w <= 8) ? (w) : (w >> 1);
> +        int deltaY = (h <= 8) ? (h) : (h >> 1);
> +
> +        /* Check if very small rectangular blocks which cannot be
> sub-divided anymore */
> +        bool smallRectPartition = partEnum == LUMA_4x4 || partEnum ==
> LUMA_16x12 ||
> +            partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum
> == LUMA_4x16;
> +        /* Check if vertical partition */
> +        bool verticalRect = partEnum == LUMA_32x64 || partEnum ==
> LUMA_16x32 || partEnum == LUMA_8x16 ||
> +            partEnum == LUMA_4x8;
> +        /* Check if horizontal partition */
> +        bool horizontalRect = partEnum == LUMA_64x32 || partEnum ==
> LUMA_32x16 || partEnum == LUMA_16x8 ||
> +            partEnum == LUMA_8x4;
> +        /* Check if asymmetric vertical partition */
> +        bool asymmetricVertical = partEnum == LUMA_12x16 || partEnum ==
> LUMA_4x16 || partEnum == LUMA_24x32 ||
> +            partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum
> == LUMA_16x64;
> +        /* Check if asymmetric horizontal partition */
> +        bool asymmetricHorizontal = partEnum == LUMA_16x12 || partEnum ==
> LUMA_16x4 || partEnum == LUMA_32x24 ||
> +            partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum
> == LUMA_64x16;
> +
> +        int tempPartEnum = 0;
> +
> +        /* If a vertical rectangular partition, it is horizontally split
> into two, for ads_x2() */
> +        if (verticalRect)
> +            tempPartEnum = partitionFromSizes(w, h >> 1);
> +        /* If a horizontal rectangular partition, it is vertically split
> into two, for ads_x2() */
> +        else if (horizontalRect)
> +            tempPartEnum = partitionFromSizes(w >> 1, h);
> +        /* We have integral planes introduced to account for asymmetric
> partitions.
> +         * Hence all asymmetric partitions except those which cannot be
> split into legal sizes,
> +         * are split into four for ads_x4() */
> +        else if (asymmetricVertical || asymmetricHorizontal)
> +            tempPartEnum = smallRectPartition ? partEnum :
> partitionFromSizes(w >> 1, h >> 1);
> +        /* General case: Square partitions. All partitions with width > 8
> are split into four
> +         * for ads_x4(), for 4x4 and 8x8 we do ads_x1() */
> +        else
> +            tempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >>
> 1, h >> 1);
> +
> +        /* Successive elimination by comparing DC before a full SAD,
> +         * because sum(abs(diff)) >= abs(diff(sum)). */
> +        primitives.pu[tempPartEnum].sad_x4(zero,
> +                         fenc,
> +                         fenc + deltaX,
> +                         fenc + deltaY * FENC_STRIDE,
> +                         fenc + deltaX + deltaY * FENC_STRIDE,
> +                         FENC_STRIDE,
> +                         encDC);
> +
> +        /* Assigning appropriate integral plane */
> +        uint32_t *sumsBase = NULL;
> +        switch (deltaX)
> +        {
> +            case 32: if (deltaY % 24 == 0)
> +                         sumsBase = integral[1];
> +                     else if (deltaY == 8)
> +                         sumsBase = integral[2];
> +                     else
> +                         sumsBase = integral[0];
> +               break;
> +            case 24: sumsBase = integral[3];
> +               break;
> +            case 16: if (deltaY % 12 == 0)
> +                         sumsBase = integral[5];
> +                     else if (deltaY == 4)
> +                         sumsBase = integral[6];
> +                     else
> +                         sumsBase = integral[4];
> +               break;
> +            case 12: sumsBase = integral[7];
> +                break;
> +            case 8: if (deltaY == 32)
> +                        sumsBase = integral[8];
> +                    else
> +                        sumsBase = integral[9];
> +                break;
> +            case 4: if (deltaY == 16)
> +                        sumsBase = integral[10];
> +                    else
> +                        sumsBase = integral[11];
> +                break;
> +            default: sumsBase = integral[11];
> +                break;
> +        }
> +
> +        if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum
> == LUMA_16x16 ||
> +            partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum
> == LUMA_8x16 ||
> +            partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum ==
> LUMA_4x16 ||
> +            partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum
> == LUMA_48x64 ||
> +            partEnum == LUMA_16x64)
> +            deltaY *= (int)stride;
> +
> +        if (verticalRect)
> +            encDC[1] = encDC[2];
> +
> +        if (horizontalRect)
> +            deltaY = deltaX;
> +
> +        /* ADS and SAD */
> +        MV tmv;
> +        for (tmv.y = minY; tmv.y <= maxY; tmv.y++)
> +        {
> +            int i, xn;
> +            int ycost = p_cost_mvy[tmv.y] << 2;
> +            if (bcost <= ycost)
> +                continue;
> +            bcost -= ycost;
> +
> +            /* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48,
> 32x8, 8x32, 64x16, 16x64 partitions
> +             * ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions
> +             * ADS_2 for all other rectangular partitions */
> +            xn = ads(encDC,
> +                    sumsBase + minX + tmv.y * stride,
> +                    deltaY,
> +                    fpelCostMvX + minX,
> +                    meScratchBuffer,
> +                    meRangeWidth,
> +                    bcost);
> +
> +            for (i = 0; i < xn - 2; i += 3)
> +                COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,
> +                             minX + meScratchBuffer[i + 1], tmv.y,
> +                             minX + meScratchBuffer[i + 2], tmv.y);
> +
> +            bcost += ycost;
> +            for (; i < xn; i++)
> +                COST_MV(minX + meScratchBuffer[i], tmv.y);
> +        }
> +        if (meScratchBuffer)
> +            x265_free(meScratchBuffer);
> +        break;
> +    }
> +
>      case X265_FULL_SEARCH:
>      {
>          // dead slow exhaustive search, but at least it uses sad_x4()
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/motion.h
> --- a/source/encoder/motion.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/motion.h Mon Nov 28 11:35:49 2016 +0530
> @@ -52,6 +52,7 @@
>      pixelcmp_t sad;
>      pixelcmp_x3_t sad_x3;
>      pixelcmp_x4_t sad_x4;
> +    pixelcmp_ads_t ads;
>      pixelcmp_t satd;
>      pixelcmp_t chromaSatd;
>
> @@ -61,6 +62,7 @@
>
>      static const int COST_MAX = 1 << 28;
>
> +    uint32_t* integral[INTEGRAL_PLANE_NUM];
>      Yuv fencPUYuv;
>      int partEnum;
>      bool bChromaSATD;
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/encoder/search.cpp
> --- a/source/encoder/search.cpp Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/encoder/search.cpp Mon Nov 28 11:35:49 2016 +0530
> @@ -2243,7 +2243,13 @@
>                          if (lmv.notZero())
>                              mvc[numMvc++] = lmv;
>                      }
> -
> +                    if (m_param->searchMethod == X265_SEA)
> +                    {
> +                        int puX = puIdx & 1;
> +                        int puY = puIdx >> 1;
> +                        for (int planes = 0; planes < INTEGRAL_PLANE_NUM;
> planes++)
> +                            m_me.integral[planes] =
> interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY *
> pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;
> +                    }
>                      setSearchRange(cu, mvp, m_param->searchRange, mvmin,
> mvmax);
>                      int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref],
> mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv,
>                        m_param->bSourceReferenceEstimation ?
> m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/test/regression-tests.txt
> --- a/source/test/regression-tests.txt Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/test/regression-tests.txt Mon Nov 28 11:35:49 2016 +0530
> @@ -149,4 +149,7 @@
>  CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff
>  CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff
>
> +#SEA Implementation Test
> +silent_cif_420.y4m,--preset veryslow --me 4
> +big_buck_bunny_360p24.y4m,--preset superfast --me 4
>  # vim: tw=200
> diff -r 5d95fbd53ca3 -r f8d523976ed6 source/x265.h
> --- a/source/x265.h Fri Nov 25 12:57:52 2016 +0530
> +++ b/source/x265.h Mon Nov 28 11:35:49 2016 +0530
> @@ -290,6 +290,7 @@
>      X265_HEX_SEARCH,
>      X265_UMH_SEARCH,
>      X265_STAR_SEARCH,
> +    X265_SEA,
>      X265_FULL_SEARCH
>  } X265_ME_METHODS;
>
> @@ -464,7 +465,7 @@
>  } x265_stats;
>
>  /* String values accepted by x265_param_parse() (and CLI) for various
> parameters */
> -static const char * const x265_motion_est_names[] = { "dia", "hex",
> "umh", "star", "full", 0 };
> +static const char * const x265_motion_est_names[] = { "dia", "hex",
> "umh", "star", "sea", "full", 0 };
>  static const char * const x265_source_csp_names[] = { "i400", "i420",
> "i422", "i444", "nv12", "nv16", 0 };
>  static const char * const x265_video_format_names[] = { "component",
> "pal", "ntsc", "secam", "mac", "undef", 0 };
>  static const char * const x265_fullrange_names[] = { "limited", "full", 0
> };
> @@ -910,9 +911,9 @@
>      /* Limit modes analyzed for each CU using cost metrics from the 4
> sub-CUs */
>      uint32_t limitModes;
>
> -    /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns
> +    /* ME search method (DIA, HEX, UMH, STAR, SEA, FULL). The search
> patterns
>       * (methods) are sorted in increasing complexity, with diamond being
> the
> -     * simplest and fastest and full being the slowest.  DIA, HEX, and
> UMH were
> +     * simplest and fastest and full being the slowest.  DIA, HEX, UMH
> and SEA were
>       * adapted from x264 directly. STAR is an adaption of the HEVC
> reference
>       * encoder's three step search, while full is a naive exhaustive
> search. The
>       * default is the star search, it has a good balance of performance
> and
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20161129/123b8da3/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: SEA.patch
Type: application/octet-stream
Size: 40105 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20161129/123b8da3/attachment-0001.obj>


More information about the x265-devel mailing list