[x265] [PATCH RFC] Calling C primitive for weighted-IPFilter and using x265 ME

Steve Borho steve at borho.org
Thu Jul 18 09:24:01 CEST 2013


On Thu, Jul 18, 2013 at 12:50 AM, <deepthidevaki at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Deepthi Devaki
> # Date 1374126601 -19800
> # Node ID 0b14a9a0468bbe26ff038027d735f5f97a4fafea
> # Parent  031c4c889edc4d2af969e2a717cd62c5a950e61a
> Calling C primitive for weighted-IPFilter and using x265 ME
>
> diff -r 031c4c889edc -r 0b14a9a0468b source/Lib/TLibCommon/TComPicYuv.cpp
> --- a/source/Lib/TLibCommon/TComPicYuv.cpp      Thu Jul 18 00:40:54 2013
> -0500
> +++ b/source/Lib/TLibCommon/TComPicYuv.cpp      Thu Jul 18 11:20:01 2013
> +0530
> @@ -251,6 +251,24 @@
>      m_bIsBorderExtended = true;
>  }
>
> +Void TComPicYuv::extendPicBorder(x265::ThreadPool *pool, wpScalingParam
> *w)
> +{
> +    if (m_bIsBorderExtended)
> +        return;
>

We can't return early here anymore.  Even when the border is already
extended, we need to make sure there exists a MotionReference for this
weight w.


> +    /* HPEL generation requires luma integer plane to already be extended
> */
> +    xExtendPicCompBorder(getLumaAddr(), getStride(), getWidth(),
> getHeight(), m_lumaMarginX, m_lumaMarginY);
> +
> +    xExtendPicCompBorder(getCbAddr(), getCStride(), getWidth() >> 1,
> getHeight() >> 1, m_chromaMarginX, m_chromaMarginY);
> +    xExtendPicCompBorder(getCrAddr(), getCStride(), getWidth() >> 1,
> getHeight() >> 1, m_chromaMarginX, m_chromaMarginY);
> +
> +    if (m_refList == NULL)
> +        m_refList = new x265::MotionReference(this, pool);
> +    m_refList->generateReferencePlanes(w);
>

Similarly here, m_refList needs to be turned into a real linked list now.
If there isn't a MotionReference with weight w, we should insert a new
MotionReference at the start of the list and interpolate with weight w.
The code that frees m_refList will need to be made aware that it is a
list, to avoid leaking memory.


> +    m_bIsBorderExtended = true;
> +}
> +
>  Void TComPicYuv::xExtendPicCompBorder(Pel* recon, Int stride, Int width,
> Int height, Int iMarginX, Int iMarginY)
>  {
>      Int   x, y;
> diff -r 031c4c889edc -r 0b14a9a0468b source/Lib/TLibCommon/TComPicYuv.h
> --- a/source/Lib/TLibCommon/TComPicYuv.h        Thu Jul 18 00:40:54 2013
> -0500
> +++ b/source/Lib/TLibCommon/TComPicYuv.h        Thu Jul 18 11:20:01 2013
> +0530
> @@ -184,6 +184,7 @@
>
>      //  Extend function of picture buffer
>      Void  extendPicBorder(x265::ThreadPool *pool);
> +    Void  extendPicBorder(x265::ThreadPool *pool, wpScalingParam *w);
>
>      //  Dump picture
>      Void  dump(Char* pFileName, Bool bAdd = false);
> diff -r 031c4c889edc -r 0b14a9a0468b source/Lib/TLibCommon/TComSlice.h
> --- a/source/Lib/TLibCommon/TComSlice.h Thu Jul 18 00:40:54 2013 -0500
> +++ b/source/Lib/TLibCommon/TComSlice.h Thu Jul 18 11:20:01 2013 +0530
> @@ -1413,7 +1413,6 @@
>      UInt        m_sliceSegmentBits;
>      Bool        m_bFinalized;
>
> -    wpScalingParam  m_weightPredTable[2][MAX_NUM_REF][3]; //
> [REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
>      wpACDCParam     m_weightACDCParam[3];                 // [0:Y, 1:U,
> 2:V]
>
>      std::vector<UInt> m_tileByteLocation;
> @@ -1431,6 +1430,8 @@
>
>  public:
>
> +    wpScalingParam  m_weightPredTable[2][MAX_NUM_REF][3]; //
> [REF_PIC_LIST_0 or REF_PIC_LIST_1][refIdx][0:Y, 1:U, 2:V]
> +
>      TComSlice();
>      virtual ~TComSlice();
>      Void      initSlice();
> diff -r 031c4c889edc -r 0b14a9a0468b source/Lib/TLibEncoder/TEncSlice.cpp
> --- a/source/Lib/TLibEncoder/TEncSlice.cpp      Thu Jul 18 00:40:54 2013
> -0500
> +++ b/source/Lib/TLibEncoder/TEncSlice.cpp      Thu Jul 18 11:20:01 2013
> +0530
> @@ -424,13 +424,30 @@
>
>      Int numPredDir = slice->isInterP() ? 1 : 2;
>
> -    for (Int refList = 0; refList < numPredDir; refList++)
> +    wpexplicit = (slice->getSliceType() == P_SLICE &&
> slice->getPPS()->getUseWP()) ;
> +
> +    if(wpexplicit)
>      {
> -        RefPicList  picList = (refList ? REF_PIC_LIST_1 : REF_PIC_LIST_0);
> -        for (Int refIdxTemp = 0; refIdxTemp <
> slice->getNumRefIdx(picList); refIdxTemp++)
> +        for (Int refList = 0; refList < numPredDir; refList++)
>          {
> -            // To do: Call the merged IP + weighted frames if weighted
> prediction enabled
> -            slice->getRefPic(picList,
> refIdxTemp)->getPicYuvRec()->extendPicBorder(x265::ThreadPool::getThreadPool());
> +            RefPicList  picList = (refList ? REF_PIC_LIST_1 :
> REF_PIC_LIST_0);
> +            for (Int refIdxTemp = 0; refIdxTemp <
> slice->getNumRefIdx(picList); refIdxTemp++)
> +            {
> +                //Call the merged IP + weighted frames if weighted
> prediction enabled
> +                wpScalingParam *w =
> &(slice->m_weightPredTable[picList][refIdxTemp][0]);
> +                slice->getRefPic(picList,
> refIdxTemp)->getPicYuvRec()->extendPicBorder(x265::ThreadPool::getThreadPool(),
> w);
> +            }
> +        }
> +    }
> +    else
> +    {
> +        for (Int refList = 0; refList < numPredDir; refList++)
> +        {
> +            RefPicList  picList = (refList ? REF_PIC_LIST_1 :
> REF_PIC_LIST_0);
> +            for (Int refIdxTemp = 0; refIdxTemp <
> slice->getNumRefIdx(picList); refIdxTemp++)
> +            {
> +                slice->getRefPic(picList,
> refIdxTemp)->getPicYuvRec()->extendPicBorder(x265::ThreadPool::getThreadPool());
> +            }
>          }
>      }
>

ok


>
> diff -r 031c4c889edc -r 0b14a9a0468b source/common/reference.cpp
> --- a/source/common/reference.cpp       Thu Jul 18 00:40:54 2013 -0500
> +++ b/source/common/reference.cpp       Thu Jul 18 11:20:01 2013 +0530
> @@ -141,6 +141,50 @@
>      xFree(m_intermediateValues);
>  }
>
> +void MotionReference::generateReferencePlanes(wpScalingParam *w)
> +{
> +        PPAScopeEvent(GenerateIntermediates);
> +        m_intermediateValues = (short*)xMalloc(short, 4 * m_intStride *
> (m_reconPic->getHeight() + s_tmpMarginY * 4));
> +
> +        short* intPtrF = m_intermediateValues;
> +        short* intPtrA = m_intermediateValues + 1 * m_intStride *
> (m_reconPic->getHeight() + s_tmpMarginY * 4);
> +        short* intPtrB = m_intermediateValues + 2 * m_intStride *
> (m_reconPic->getHeight() + s_tmpMarginY * 4);
> +        short* intPtrC = m_intermediateValues + 3 * m_intStride *
> (m_reconPic->getHeight() + s_tmpMarginY * 4);
> +
> +        int bufOffset = -(s_tmpMarginY + s_intMarginY) * m_lumaStride -
> (s_tmpMarginX + s_intMarginX);
> +        pixel *srcPtr = (pixel*)m_reconPic->getLumaAddr() + bufOffset;
> +
> +        int weight      = w->inputWeight;
> +        int offset = w->inputOffset * (1 << (g_bitDepth - 8));
> +        int shift  = w->log2WeightDenom;
> +        int round  = (w->log2WeightDenom >= 1) ? (1 <<
> (w->log2WeightDenom - 1)) : (0);
> +
> +
> +        /* This one function call generates the four intermediate (short)
> planes for each
> +         * QPEL offset in the horizontal direction.  At the same time it
> outputs the three
> +         * Y=0 output (padded pixel) planes since they require no
> vertical interpolation */
> +
> +        primitives.filterHwghtd(srcPtr, m_lumaStride,               //
> source buffer
> +                        intPtrF, intPtrA, intPtrB, intPtrC, m_intStride,
> // 4 intermediate HPEL buffers
> +                        m_lumaPlane[1][0] + bufOffset,
> +                        m_lumaPlane[2][0] + bufOffset,
> +                        m_lumaPlane[3][0] + bufOffset, m_lumaStride,
> // 3 (x=n, y=0) output buffers (no V interp)
> +                        m_filterWidth + (2 * s_intMarginX),
>  // filter dimensions with margins
> +                        m_filterHeight + (2 * s_intMarginY),
> +                        m_reconPic->m_lumaMarginX - s_tmpMarginX -
> s_intMarginX, // pixel extension margins
> +                        m_reconPic->m_lumaMarginY - s_tmpMarginY -
> s_intMarginY,
> +                        weight, round, shift, offset);
> +
> +        /* serial path for when no thread pool is present: ALWAYS calling
> serial path temporarily until weightp+thread fix */
>

What happens when it runs in parallel?  Is it just that you need to store
the weights in the MotionReference so it can pass them along to the worker
threads?


> +        for (int i = 0; i < 4; i++)
> +        {
> +            generateReferencePlane(i, weight, round, shift, offset);
> +        }
> +
> +    xFree(m_intermediateValues);
> +}
> +
> +
>  bool MotionReference::findJob()
>  {
>      /* Called by thread pool worker threads */
> @@ -175,3 +219,19 @@
>
>      primitives.filterVmulti(intPtr, m_intStride, dstPtr1, dstPtr2,
> dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight,
> m_reconPic->m_lumaMarginX - s_tmpMarginX, m_reconPic->m_lumaMarginY -
> s_tmpMarginY);
>  }
>

Might as well pass wpScalingParam to this function too.


> +void MotionReference::generateReferencePlane(int x, int w, int roundw,
> int shiftw, int offsetw)
> +{
> +    PPAScopeEvent(GenerateReferencePlanes);
> +
> +    /* this function will be called by 4 threads, with x = 0, 1, 2, 3 */
> +    short* filteredBlockTmp = m_intermediateValues + x * m_intStride *
> (m_reconPic->getHeight() + s_tmpMarginY * 4);
> +    short* intPtr = filteredBlockTmp + s_intMarginY * m_intStride +
> s_intMarginX;
> +
> +    /* the Y=0 plane was generated during horizontal interpolation */
> +    pixel *dstPtr1 = m_lumaPlane[x][1] - s_tmpMarginY * m_lumaStride -
> s_tmpMarginX;
> +    pixel *dstPtr2 = m_lumaPlane[x][2] - s_tmpMarginY * m_lumaStride -
> s_tmpMarginX;
> +    pixel *dstPtr3 = m_lumaPlane[x][3] - s_tmpMarginY * m_lumaStride -
> s_tmpMarginX;
> +
> +    primitives.filterVwghtd(intPtr, m_intStride, dstPtr1, dstPtr2,
> dstPtr3, m_lumaStride, m_filterWidth, m_filterHeight,
> m_reconPic->m_lumaMarginX - s_tmpMarginX, m_reconPic->m_lumaMarginY -
> s_tmpMarginY, w, roundw, shiftw, offsetw);
> +}
> \ No newline at end of file
> diff -r 031c4c889edc -r 0b14a9a0468b source/common/reference.h
> --- a/source/common/reference.h Thu Jul 18 00:40:54 2013 -0500
> +++ b/source/common/reference.h Thu Jul 18 11:20:01 2013 +0530
> @@ -28,6 +28,8 @@
>  #include "threading.h"
>  #include "threadpool.h"
>
> +#include "TLibCommon/TComSlice.h"
> +
>  class TComPicYuv;
>
>  namespace x265 {
> @@ -42,6 +44,8 @@
>      ~MotionReference();
>
>      void generateReferencePlanes();
> +    void generateReferencePlanes(wpScalingParam *w);
> +
>
>      /* indexed by [hpelx|qpelx][hpely|qpely] */
>      pixel* m_lumaPlane[4][4];
> @@ -54,6 +58,8 @@
>
>      bool findJob();
>      void generateReferencePlane(int idx);
> +    void generateReferencePlane(int x, int w, int roundw, int shiftw, int
> offsetw);
> +
>
>      intptr_t     m_startPad;
>      TComPicYuv  *m_reconPic;
> diff -r 031c4c889edc -r 0b14a9a0468b source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp        Thu Jul 18 00:40:54 2013 -0500
> +++ b/source/encoder/encoder.cpp        Thu Jul 18 11:20:01 2013 +0530
> @@ -196,7 +196,7 @@
>      setQP(param->qp);
>
>      //====== Motion search ========
> -    if (param->searchMethod != X265_ORIG_SEARCH &&
> (param->bEnableWeightedPred || param->bEnableWeightedBiPred))
>

Probably too early for this.  Once this works, most of the HM search code
will be removed.


> +    if (param->searchMethod != X265_ORIG_SEARCH &&
> (param->bEnableWeightedBiPred))
>      {
>          x265_log(param, X265_LOG_WARNING, "Weighted prediction only
> supported by HM ME, forcing --me 4\n");
>          param->searchMethod = X265_ORIG_SEARCH;
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>
>


-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/ce6d5019/attachment.html>


More information about the x265-devel mailing list