[x265] [PATCH] frameencoder: remove second encodeCU() pass over CTUs when SAO is disabled

Mon Sep 8 13:37:16 CEST 2014

On 09/08, Deepthi Nandakumar wrote:
> On Sat, Sep 6, 2014 at 10:08 PM, Steve Borho <steve at borho.org> wrote:
> 
> > # HG changeset patch
> > # User Steve Borho <steve at borho.org>
> > # Date 1409932577 -7200
> > #      Fri Sep 05 17:56:17 2014 +0200
> > # Node ID 07d69bce1760a28be1b1ee1821dfeb3335602422
> > # Parent  795878af39730deb24e2ee0e585c625084bb031b
> > frameencoder: remove second encodeCU() pass over CTUs when SAO is disabled
> >
> > This is a performance optimization: it allows the encoder to generate the
> > final bitstream of each CTU as it is compressed, while it is still cache hot.
> >
> > When SAO is enabled, SAO analysis must be performed and coded at the start
> > of the CTU, but SAO analysis currently requires surrounding CTUs to be
> > encoded, making the second pass unavoidable.
> >
> > diff -r 795878af3973 -r 07d69bce1760 source/encoder/frameencoder.cpp
> > --- a/source/encoder/frameencoder.cpp   Fri Sep 05 16:03:44 2014 +0200
> > +++ b/source/encoder/frameencoder.cpp   Fri Sep 05 17:56:17 2014 +0200
> > @@ -192,16 +192,6 @@
> >          }
> >      }
> >
> > -    uint32_t numSubstreams = m_param->bEnableWavefront ?
> > m_frame->getPicSym()->getFrameHeightInCU() : 1;
> > -    if (!m_outStreams)
> > -    {
> > -        m_outStreams = new Bitstream[numSubstreams];
> > -        m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
> > -    }
> > -    else
> > -        for (uint32_t i = 0; i < numSubstreams; i++)
> > -            m_outStreams[i].resetBits();
> > -
> >      /* Get the QP for this frame from rate control. This call may block
> > until
> >       * frames ahead of it in encode order have called rateControlEnd() */
> >      int qp = m_top->m_rateControl->rateControlStart(m_frame, &m_rce,
> > m_top);
> > @@ -214,6 +204,24 @@
> >
> >      m_frameFilter.start(m_frame, m_initSliceContext, qp);
> >
> > +    // reset entropy coders
> > +    m_entropyCoder.load(m_initSliceContext);
> > +    for (int i = 0; i < m_numRows; i++)
> > +        m_rows[i].init(m_initSliceContext);
> > +
> > +    uint32_t numSubstreams = m_param->bEnableWavefront ?
> > m_frame->getPicSym()->getFrameHeightInCU() : 1;
> > +    if (!m_outStreams)
> > +    {
> > +        m_outStreams = new Bitstream[numSubstreams];
> > +        m_substreamSizes = X265_MALLOC(uint32_t, numSubstreams);
> > +        if (!m_param->bEnableSAO)
> > +            for (uint32_t i = 0; i < numSubstreams; i++)
> > +
> > m_rows[i].rdEntropyCoders[0][CI_CURR_BEST].setBitstream(&m_outStreams[i]);
> > +    }
> > +    else
> > +        for (uint32_t i = 0; i < numSubstreams; i++)
> > +            m_outStreams[i].resetBits();
> > +
> >      if (m_frame->m_lowres.bKeyframe)
> >      {
> >          if (m_param->bEmitHRDSEI)
> > @@ -328,7 +336,7 @@
> >      m_entropyCoder.setBitstream(&m_bs);
> >      m_entropyCoder.codeSliceHeader(slice);
> >
> > -    // re-encode each row of CUs for the final time (TODO: get rid of
> > this second pass)
> > +    // finish encode of each CTU row
> >      encodeSlice();
> >
> >      // serialize each row, record final lengths in slice header
> > @@ -409,8 +417,40 @@
> >      const uint32_t widthInLCUs =
> > m_frame->getPicSym()->getFrameWidthInCU();
> >      const uint32_t lastCUAddr = (slice->m_endCUAddr +
> > m_frame->getNumPartInCU() - 1) / m_frame->getNumPartInCU();
> >      const int numSubstreams = m_param->bEnableWavefront ?
> > m_frame->getPicSym()->getFrameHeightInCU() : 1;
> > +
> > +    if (!m_param->bEnableSAO)
> > +    {
> > +        /* terminate each row and collect stats */
> > +        for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++)
> > +        {
> > +            uint32_t col = cuAddr % widthInLCUs;
> > +
> > +            if (m_param->bEnableWavefront && col == widthInLCUs - 1)
> > +            {
> > +                uint32_t lin = cuAddr / widthInLCUs;
> > +                uint32_t subStrm = lin % numSubstreams;
> > +
> > m_rows[subStrm].rdEntropyCoders[0][CI_CURR_BEST].codeTerminatingBit(1);
> > +
> > m_rows[subStrm].rdEntropyCoders[0][CI_CURR_BEST].codeSliceFinish();
> > +                m_outStreams[subStrm].writeByteAlignment();
> > +            }
> > +
> > +            // Collect Frame Stats for 2 pass
> > +            TComDataCU* cu = m_frame->getCU(cuAddr);
> > +            m_frameStats.mvBits += cu->m_mvBits;
> > +            m_frameStats.coeffBits += cu->m_coeffBits;
> > +            m_frameStats.miscBits += cu->m_totalBits - (cu->m_mvBits +
> > cu->m_coeffBits);
> > +        }
> > +        if (!m_param->bEnableWavefront)
> > +        {
> > +
> > m_rows[0].rdEntropyCoders[0][CI_CURR_BEST].codeTerminatingBit(1);
> > +            m_rows[0].rdEntropyCoders[0][CI_CURR_BEST].codeSliceFinish();
> > +            m_outStreams[0].writeByteAlignment();
> > +        }
> > +
> > +        return;
> > +    }
> > +
> >      SAOParam *saoParam = slice->m_pic->getPicSym()->m_saoParam;
> > -
> >      for (uint32_t cuAddr = 0; cuAddr < lastCUAddr; cuAddr++)
> >      {
> >          uint32_t col = cuAddr % widthInLCUs;
> > @@ -487,11 +527,6 @@
> >      PPAScopeEvent(FrameEncoder_compressRows);
> >      Slice* slice = m_frame->m_picSym->m_slice;
> >
> > -    // reset entropy coders
> > -    m_entropyCoder.load(m_initSliceContext);
> > -    for (int i = 0; i < m_numRows; i++)
> > -        m_rows[i].init(m_initSliceContext);
> > -
> >      m_bAllRowsStop = false;
> >      m_vbvResetTriggerRow = -1;
> >
> > @@ -672,15 +707,17 @@
> >          }
> >
> >          if (m_param->bEnableWavefront && col == 0 && row > 0)
> > +        {
> >              // Load SBAC coder context from previous row.
> > +
> > curRow.rdEntropyCoders[0][CI_CURR_BEST].copyState(m_initSliceContext);
> >
> >  curRow.rdEntropyCoders[0][CI_CURR_BEST].loadContexts(m_rows[row -
> > 1].bufferEntropyCoder);
> >
> 
> It's the same thing in encodeSlice as well, but why are we copying state
> from m_initSliceContext, and contexts from the saved previous row coder?
> Shouldn't both state and contexts be copied from the previous row coder?

I don't know the exact reasons for it, but with WPP only the contexts
are copied from the row above. The rest of the state is initialized
uniformly for each row.

-- 
Steve Borho