[x265] [PATCH] fix: support for Transquant Bypass mode

Tue May 13 15:29:04 CEST 2014

On Tue, May 13, 2014 at 7:26 AM,  <ashok at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Ashok Kumar Mishra<ashok at multicorewareinc.com>
> # Date 1399982012 -19800
> #      Tue May 13 17:23:32 2014 +0530
> # Node ID 8ce774039d126d484efe8deffc05c91663497cda
> # Parent  a4d0d5679c28d6523e6b01a55fe548c3140118a6
> fix: support for Transquant Bypass mode
>
> diff -r a4d0d5679c28 -r 8ce774039d12 source/Lib/TLibCommon/TComDataCU.cpp
> --- a/source/Lib/TLibCommon/TComDataCU.cpp      Sun May 11 17:32:37 2014 +0900
> +++ b/source/Lib/TLibCommon/TComDataCU.cpp      Tue May 13 17:23:32 2014 +0530
> @@ -339,7 +339,7 @@
>      m_cuMvField[1].clearMvField();
>  }
>
> -void TComDataCU::initEstData(uint32_t depth)
> +void TComDataCU::initEstData(uint32_t depth, bool bIsLosslessMode)
>  {
>      m_totalCost        = MAX_INT64;
>      m_sa8dCost         = MAX_INT64;
> @@ -359,7 +359,7 @@
>          m_skipFlag[i]   = false;
>          m_partSizes[i] = SIZE_NONE;
>          m_predModes[i] = MODE_NONE;
> -        m_cuTransquantBypass[i] = false;
> +        m_cuTransquantBypass[i] = bIsLosslessMode;
>          m_iPCMFlags[i] = 0;
>          m_bMergeFlags[i] = 0;
>          m_lumaIntraDir[i] = DC_IDX;
> diff -r a4d0d5679c28 -r 8ce774039d12 source/Lib/TLibCommon/TComDataCU.h
> --- a/source/Lib/TLibCommon/TComDataCU.h        Sun May 11 17:32:37 2014 +0900
> +++ b/source/Lib/TLibCommon/TComDataCU.h        Tue May 13 17:23:32 2014 +0530
> @@ -181,7 +181,7 @@
>      void          destroy();
>
>      void          initCU(TComPic* pic, uint32_t cuAddr);
> -    void          initEstData(uint32_t depth);
> +    void          initEstData(uint32_t depth, bool bIsLosslessMode);
>      void          initEstData(uint32_t depth, int qp);
>      void          initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth);
>      void          initSubCU(TComDataCU* cu, uint32_t partUnitIdx, uint32_t depth, int qp);
> diff -r a4d0d5679c28 -r 8ce774039d12 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
> --- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp        Sun May 11 17:32:37 2014 +0900
> +++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp        Tue May 13 17:23:32 2014 +0530
> @@ -1368,10 +1368,10 @@
>      uint32_t height;
>      uint32_t pcmLeftShiftBit;
>      uint32_t x, y;
> +    int hChromaShift = cu->getHorzChromaShift();
> +    int vChromaShift = cu->getVertChromaShift();
>      uint32_t lumaOffset   = absZOrderIdx << cu->getPic()->getLog2UnitSize() * 2;
> -    uint32_t chromaOffset = lumaOffset >> 2;
> -
> -    //uint32_t chromaOffset = lumaOffset >> (m_hChromaShift + m_vChromaShift);
> +    uint32_t chromaOffset = lumaOffset >> (hChromaShift + vChromaShift);
>
>      if (ttText == TEXT_LUMA)
>      {
> @@ -1403,10 +1403,9 @@
>          }
>
>          stride = pcPicYuvRec->getCStride();
> -        //width  = ((g_maxCUSize >> depth) >> m_hChromaShift);
> -        //height = ((g_maxCUSize >> depth) >> m_vhChromaShift);
> -        width  = ((g_maxCUSize >> depth) >> 1);
> -        height = ((g_maxCUSize >> depth) >> 1);
> +        width  = ((g_maxCUSize >> depth) >> hChromaShift);
> +        height = ((g_maxCUSize >> depth) >> vChromaShift);
> +
>          if (cu->isLosslessCoded(absZOrderIdx) && !cu->getIPCMFlag(absZOrderIdx))
>          {
>              pcmLeftShiftBit = 0;
> diff -r a4d0d5679c28 -r 8ce774039d12 source/Lib/TLibEncoder/TEncCu.cpp
> --- a/source/Lib/TLibEncoder/TEncCu.cpp Sun May 11 17:32:37 2014 +0900
> +++ b/source/Lib/TLibEncoder/TEncCu.cpp Tue May 13 17:23:32 2014 +0530
> @@ -557,6 +557,9 @@
>      //PPAScopeEvent(TEncCu_xCompressIntraCU + depth);
>
>      TComPic* pic = outBestCU->getPic();
> +    int minTQ, maxTQ;
> +    bool bIsTQBypassEnable = false;
> +    bool bIsLosslessMode   = false;

Nit: white-space issue on the line above.

>
>      if (depth == 0)
>      {
> @@ -580,20 +583,38 @@
>                            bpely <= slice->getSPS()->getPicHeightInLumaSamples());
>      }
>
> +    minTQ = maxTQ = 1;
> +    if ((outTempCU->getSlice()->getPPS()->getTransquantBypassEnableFlag()))
> +    {
> +        bIsTQBypassEnable = true; // mark that the first iteration is to cost TQB mode.
> +        minTQ = minTQ - 1;        // increase loop variable range by 1, to allow testing of TQB mode.
> +        if (m_param->bEnableCUTransquantBypass)
> +        {
> +            maxTQ = minTQ;
> +        }
> +    }
> +
>      // We need to split, so don't try these modes.
>      if (bInsidePicture)
>      {
> -        outTempCU->initEstData(depth);
> +        for (int iTQ = minTQ; iTQ <= maxTQ; iTQ++)
> +        {
> +            bIsLosslessMode = bIsTQBypassEnable && (iTQ == minTQ);
>
> -        xCheckRDCostIntra(outBestCU, outTempCU, SIZE_2Nx2N);
> -        outTempCU->initEstData(depth);
> +            outTempCU->initEstData(depth, bIsLosslessMode);
>
> -        if (depth == g_maxCUDepth - g_addCUDepth)
> -        {
> -            if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +            xCheckRDCostIntra(outBestCU, outTempCU, SIZE_2Nx2N);
> +
> +            outTempCU->initEstData(depth, bIsLosslessMode);
> +
> +            if (depth == g_maxCUDepth - g_addCUDepth)
>              {
> -                xCheckRDCostIntra(outBestCU, outTempCU, SIZE_NxN);
> +                if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +                {
> +                    xCheckRDCostIntra(outBestCU, outTempCU, SIZE_NxN);
> +                }
>              }
> +
>          }
>
>          m_entropyCoder->resetBits();
> @@ -602,7 +623,13 @@
>          outBestCU->m_totalCost  = m_rdCost->calcRdCost(outBestCU->m_totalDistortion, outBestCU->m_totalBits);
>      }
>
> -    outTempCU->initEstData(depth);
> +    // copy original YUV samples to PCM buffer
> +    if (outBestCU->isLosslessCoded(0) && (outBestCU->getIPCMFlag(0) == false))
> +    {
> +        xFillPCMBuffer(outBestCU, m_origYuv[depth]);
> +    }
> +
> +    outTempCU->initEstData(depth, bIsLosslessMode);
>
>      // further split
>      if (depth < g_maxCUDepth - g_addCUDepth)
> @@ -696,6 +723,8 @@
>      //PPAScopeEvent(TEncCu_xCompressCU + depth);
>
>      TComPic* pic = outBestCU->getPic();
> +    int minTQ, maxTQ;
> +    bool bIsTQBypassEnable = false;
>
>      if (depth == 0)
>      {
> @@ -726,196 +755,218 @@
>                            bpely <= slice->getSPS()->getPicHeightInLumaSamples());
>      }
>
> +    minTQ = maxTQ = 1;
> +    if ((outTempCU->getSlice()->getPPS()->getTransquantBypassEnableFlag()))
> +    {
> +        bIsTQBypassEnable = true; // mark that the first iteration is to cost TQB mode.
> +        minTQ = minTQ - 1;        // increase loop variable range by 1, to allow testing of TQB mode along.
> +        if (m_param->bEnableCUTransquantBypass)
> +        {
> +            maxTQ = minTQ;
> +        }
> +    }
> +
>      // We need to split, so don't try these modes.
>      if (bInsidePicture)
>      {
> -        outTempCU->initEstData(depth);
> +        for (int iTQ = minTQ; iTQ <= maxTQ; iTQ++)
> +        {
> +            bool bIsLosslessMode = bIsTQBypassEnable && (iTQ == minTQ);
>
> -        // do inter modes, SKIP and 2Nx2N
> -        if (slice->getSliceType() != I_SLICE)
> -        {
> -            // 2Nx2N
> -            if (m_param->bEnableEarlySkip)
> +            outTempCU->initEstData(depth, bIsLosslessMode);
> +
> +            // do inter modes, SKIP and 2Nx2N
> +            if (slice->getSliceType() != I_SLICE)
>              {
> -                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2Nx2N);
> -                outTempCU->initEstData(depth); // by competition for inter_2Nx2N
> -            }
> -            // by Merge for inter_2Nx2N
> -            xCheckRDCostMerge2Nx2N(outBestCU, outTempCU, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth]);
> +                // 2Nx2N
> +                if (m_param->bEnableEarlySkip)
> +                {
> +                    xCheckRDCostInter(outBestCU, outTempCU, SIZE_2Nx2N);
> +                    outTempCU->initEstData(depth, bIsLosslessMode); // by competition for inter_2Nx2N
> +                }
> +                // by Merge for inter_2Nx2N
> +                xCheckRDCostMerge2Nx2N(outBestCU, outTempCU, &earlyDetectionSkipMode, m_bestPredYuv[depth], m_bestRecoYuv[depth]);
>
> -            outTempCU->initEstData(depth);
> +                outTempCU->initEstData(depth, bIsLosslessMode);
>
> -            if (!m_param->bEnableEarlySkip)
> -            {
> -                // 2Nx2N, NxN
> -                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2Nx2N);
> -                outTempCU->initEstData(depth);
> -                if (m_param->bEnableCbfFastMode)
> +                if (!m_param->bEnableEarlySkip)
>                  {
> -                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                    // 2Nx2N, NxN
> +                    xCheckRDCostInter(outBestCU, outTempCU, SIZE_2Nx2N);
> +                    outTempCU->initEstData(depth, bIsLosslessMode);
> +                    if (m_param->bEnableCbfFastMode)
> +                    {
> +                        doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                    }
>                  }
>              }
>          }
>
>          if (!earlyDetectionSkipMode)
>          {
> -            outTempCU->initEstData(depth);
> +            for (int iTQ = minTQ; iTQ <= maxTQ; iTQ++)
> +            {
> +                bool bIsLosslessMode = bIsTQBypassEnable && (iTQ == minTQ); // If lossless, then iQP is irrelevant for subsequent modules.

Why is a loop necessary?  This smells like a brute-force RDO that the
HM would do, but it probably doesn't belong in a real encoder.

>
> -            // do inter modes, NxN, 2NxN, and Nx2N
> -            if (slice->getSliceType() != I_SLICE)
> -            {
> -                // 2Nx2N, NxN
> -                if (!(outBestCU->getCUSize(0) == 8))
> +                outTempCU->initEstData(depth, bIsLosslessMode);
> +
> +                // do inter modes, NxN, 2NxN, and Nx2N
> +                if (slice->getSliceType() != I_SLICE)
>                  {
> -                    if (depth == g_maxCUDepth - g_addCUDepth && doNotBlockPu)
> +                    // 2Nx2N, NxN
> +                    if (!(outBestCU->getCUSize(0) == 8))
>                      {
> -                        xCheckRDCostInter(outBestCU, outTempCU, SIZE_NxN);
> -                        outTempCU->initEstData(depth);
> -                    }
> -                }
> -
> -                if (m_param->bEnableRectInter)
> -                {
> -                    // 2NxN, Nx2N
> -                    if (doNotBlockPu)
> -                    {
> -                        xCheckRDCostInter(outBestCU, outTempCU, SIZE_Nx2N);
> -                        outTempCU->initEstData(depth);
> -                        if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_Nx2N)
> +                        if (depth == g_maxCUDepth - g_addCUDepth && doNotBlockPu)
>                          {
> -                            doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_NxN);
> +                            outTempCU->initEstData(depth, bIsLosslessMode);
>                          }
>                      }
> -                    if (doNotBlockPu)
> +
> +                    if (m_param->bEnableRectInter)
>                      {
> -                        xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxN);
> -                        outTempCU->initEstData(depth);
> -                        if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxN)
> -                        {
> -                            doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> -                        }
> -                    }
> -                }
> -
> -                // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
> -                if (slice->getSPS()->getAMPAcc(depth))
> -                {
> -                    bool bTestAMP_Hor = false, bTestAMP_Ver = false;
> -                    bool bTestMergeAMP_Hor = false, bTestMergeAMP_Ver = false;
> -
> -                    deriveTestModeAMP(outBestCU, parentSize, bTestAMP_Hor, bTestAMP_Ver, bTestMergeAMP_Hor, bTestMergeAMP_Ver);
> -
> -                    // Do horizontal AMP
> -                    if (bTestAMP_Hor)
> -                    {
> +                        // 2NxN, Nx2N
>                          if (doNotBlockPu)
>                          {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnU);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
> +                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_Nx2N);
> +                            outTempCU->initEstData(depth, bIsLosslessMode);
> +                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_Nx2N)
>                              {
>                                  doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                              }
>                          }
>                          if (doNotBlockPu)
>                          {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnD);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
> -                            {
> -                                doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> -                            }
> -                        }
> -                    }
> -                    else if (bTestMergeAMP_Hor)
> -                    {
> -                        if (doNotBlockPu)
> -                        {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnU, true);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
> -                            {
> -                                doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> -                            }
> -                        }
> -                        if (doNotBlockPu)
> -                        {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnD, true);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
> +                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxN);
> +                            outTempCU->initEstData(depth, bIsLosslessMode);
> +                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxN)
>                              {
>                                  doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
>                              }
>                          }
>                      }
>
> -                    // Do horizontal AMP
> -                    if (bTestAMP_Ver)
> +                    // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N)
> +                    if (slice->getSPS()->getAMPAcc(depth))
>                      {
> -                        if (doNotBlockPu)
> +                        bool bTestAMP_Hor = false, bTestAMP_Ver = false;
> +                        bool bTestMergeAMP_Hor = false, bTestMergeAMP_Ver = false;
> +
> +                        deriveTestModeAMP(outBestCU, parentSize, bTestAMP_Hor, bTestAMP_Ver, bTestMergeAMP_Hor, bTestMergeAMP_Ver);
> +
> +                        // Do horizontal AMP
> +                        if (bTestAMP_Hor)
>                          {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_nLx2N);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
> +                            if (doNotBlockPu)
>                              {
> -                                doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnU);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
> +                            }
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnD);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
>                              }
>                          }
> -                        if (doNotBlockPu)
> +                        else if (bTestMergeAMP_Hor)
>                          {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_nRx2N);
> -                            outTempCU->initEstData(depth);
> -                        }
> -                    }
> -                    else if (bTestMergeAMP_Ver)
> -                    {
> -                        if (doNotBlockPu)
> -                        {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_nLx2N, true);
> -                            outTempCU->initEstData(depth);
> -                            if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
> +                            if (doNotBlockPu)
>                              {
> -                                doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnU, true);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnU)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
> +                            }
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_2NxnD, true);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_2NxnD)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
>                              }
>                          }
> -                        if (doNotBlockPu)
> +
> +                        // Do horizontal AMP
> +                        if (bTestAMP_Ver)
>                          {
> -                            xCheckRDCostInter(outBestCU, outTempCU, SIZE_nRx2N, true);
> -                            outTempCU->initEstData(depth);
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_nLx2N);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
> +                            }
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_nRx2N);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                            }
> +                        }
> +                        else if (bTestMergeAMP_Ver)
> +                        {
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_nLx2N, true);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                                if (m_param->bEnableCbfFastMode && outBestCU->getPartitionSize(0) == SIZE_nLx2N)
> +                                {
> +                                    doNotBlockPu = outBestCU->getQtRootCbf(0) != 0;
> +                                }
> +                            }
> +                            if (doNotBlockPu)
> +                            {
> +                                xCheckRDCostInter(outBestCU, outTempCU, SIZE_nRx2N, true);
> +                                outTempCU->initEstData(depth, bIsLosslessMode);
> +                            }
>                          }
>                      }
>                  }
> -            }
>
> -            // do normal intra modes
> -            // speedup for inter frames
> -            if (slice->getSliceType() == I_SLICE ||
> -                outBestCU->getCbf(0, TEXT_LUMA) != 0   ||
> -                outBestCU->getCbf(0, TEXT_CHROMA_U) != 0   ||
> -                outBestCU->getCbf(0, TEXT_CHROMA_V) != 0) // avoid very complex intra if it is unlikely
> -            {
> -                xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_2Nx2N);
> -                outTempCU->initEstData(depth);
> +                // do normal intra modes
> +                // speedup for inter frames
> +                if (slice->getSliceType() == I_SLICE ||
> +                    outBestCU->getCbf(0, TEXT_LUMA) != 0   ||
> +                    outBestCU->getCbf(0, TEXT_CHROMA_U) != 0   ||
> +                    outBestCU->getCbf(0, TEXT_CHROMA_V) != 0) // avoid very complex intra if it is unlikely
> +                {
> +                    xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_2Nx2N);
> +                    outTempCU->initEstData(depth, bIsLosslessMode);
>
> -                if (depth == g_maxCUDepth - g_addCUDepth)
> -                {
> -                    if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +                    if (depth == g_maxCUDepth - g_addCUDepth)
>                      {
> -                        xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_NxN);
> -                        outTempCU->initEstData(depth);
> +                        if (outTempCU->getCUSize(0) > (1 << slice->getSPS()->getQuadtreeTULog2MinSize()))
> +                        {
> +                            xCheckRDCostIntraInInter(outBestCU, outTempCU, SIZE_NxN);
> +                            outTempCU->initEstData(depth, bIsLosslessMode);
> +                        }
>                      }
>                  }
> -            }
> -            // test PCM
> -            if (slice->getSPS()->getUsePCM()
> -                && outTempCU->getCUSize(0) <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> -                && outTempCU->getCUSize(0) >= (1 << slice->getSPS()->getPCMLog2MinSize()))
> -            {
> -                uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * outBestCU->getCUSize(0) * outBestCU->getCUSize(0) / 2;
> -                uint32_t bestbits = outBestCU->m_totalBits;
> -                if ((bestbits > rawbits) || (outBestCU->m_totalCost > m_rdCost->calcRdCost(0, rawbits)))
> +                // test PCM
> +                if (slice->getSPS()->getUsePCM()
> +                    && outTempCU->getCUSize(0) <= (1 << slice->getSPS()->getPCMLog2MaxSize())
> +                    && outTempCU->getCUSize(0) >= (1 << slice->getSPS()->getPCMLog2MinSize()))
>                  {
> -                    xCheckIntraPCM(outBestCU, outTempCU);
> +                    uint32_t rawbits = (2 * X265_DEPTH + X265_DEPTH) * outBestCU->getCUSize(0) * outBestCU->getCUSize(0) / 2;
> +                    uint32_t bestbits = outBestCU->m_totalBits;
> +                    if ((bestbits > rawbits) || (outBestCU->m_totalCost > m_rdCost->calcRdCost(0, rawbits)))
> +                    {
> +                        xCheckIntraPCM(outBestCU, outTempCU);
> +                        outTempCU->initEstData(depth, bIsLosslessMode);
> +                    }
>                  }
>              }
>          }
> @@ -938,80 +989,90 @@
>          xFillPCMBuffer(outBestCU, m_origYuv[depth]);
>      }
>
> -    outTempCU->initEstData(depth);
> +    minTQ = maxTQ = 1;
> +    if (m_param->bEnableCUTransquantBypass)
> +    {
> +        maxTQ = minTQ;
> +    }
>
> -    // further split
> -    if (bSubBranch && depth < g_maxCUDepth - g_addCUDepth)
> +    for (int iTQ = minTQ; iTQ <= maxTQ; iTQ++)
>      {
> -        uint8_t     nextDepth     = depth + 1;
> -        TComDataCU* subBestPartCU = m_bestCU[nextDepth];
> -        TComDataCU* subTempPartCU = m_tempCU[nextDepth];
> -        uint32_t partUnitIdx = 0;
> -        for (; partUnitIdx < 4; partUnitIdx++)
> +        bool bIsLosslessMode = false;
> +        outTempCU->initEstData(depth, bIsLosslessMode);
> +
> +        // further split
> +        if (bSubBranch && depth < g_maxCUDepth - g_addCUDepth)
>          {
> -            subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth); // clear sub partition datas or init.
> +            uint8_t     nextDepth     = depth + 1;
> +            TComDataCU* subBestPartCU = m_bestCU[nextDepth];
> +            TComDataCU* subTempPartCU = m_tempCU[nextDepth];
> +            uint32_t partUnitIdx = 0;
> +            for (; partUnitIdx < 4; partUnitIdx++)
> +            {
> +                subBestPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth); // clear sub partition datas or init.
>
> -            if (bInsidePicture ||
> -                ((subBestPartCU->getCUPelX() < slice->getSPS()->getPicWidthInLumaSamples()) &&
> -                 (subBestPartCU->getCUPelY() < slice->getSPS()->getPicHeightInLumaSamples())))
> -            {
> -                subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth); // clear sub partition datas or init.
> -                if (0 == partUnitIdx) //initialize RD with previous depth buffer
> +                if (bInsidePicture ||
> +                    ((subBestPartCU->getCUPelX() < slice->getSPS()->getPicWidthInLumaSamples()) &&
> +                    (subBestPartCU->getCUPelY() < slice->getSPS()->getPicHeightInLumaSamples())))
>                  {
> -                    m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
> +                    subTempPartCU->initSubCU(outTempCU, partUnitIdx, nextDepth); // clear sub partition datas or init.
> +                    if (0 == partUnitIdx) //initialize RD with previous depth buffer
> +                    {
> +                        m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[depth][CI_CURR_BEST]);
> +                    }
> +                    else
> +                    {
> +                        m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[nextDepth][CI_NEXT_BEST]);
> +                    }
> +
> +                    xCompressCU(subBestPartCU, subTempPartCU, nextDepth, bInsidePicture);
> +                    outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // Keep best part data to current temporary data.
> +                    xCopyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth);
>                  }
>                  else
>                  {
> -                    m_rdSbacCoders[nextDepth][CI_CURR_BEST]->load(m_rdSbacCoders[nextDepth][CI_NEXT_BEST]);
> -                }
> -
> -                xCompressCU(subBestPartCU, subTempPartCU, nextDepth, bInsidePicture);
> -                outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth); // Keep best part data to current temporary data.
> -                xCopyYuv2Tmp(subBestPartCU->getTotalNumPart() * partUnitIdx, nextDepth);
> -            }
> -            else
> -            {
> -                subBestPartCU->copyToPic(nextDepth);
> -                outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth);
> -            }
> -        }
> -
> -        if (bInsidePicture)
> -        {
> -            m_entropyCoder->resetBits();
> -            m_entropyCoder->encodeSplitFlag(outTempCU, 0, depth);
> -            outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
> -        }
> -        outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
> -
> -        if ((g_maxCUSize >> depth) == slice->getPPS()->getMinCuDQPSize() && slice->getPPS()->getUseDQP())
> -        {
> -            bool hasResidual = false;
> -            for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++)
> -            {
> -                if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) ||
> -                    outTempCU->getCbf(blkIdx, TEXT_CHROMA_V))
> -                {
> -                    hasResidual = true;
> -                    break;
> +                    subBestPartCU->copyToPic(nextDepth);
> +                    outTempCU->copyPartFrom(subBestPartCU, partUnitIdx, nextDepth);
>                  }
>              }
>
> -            uint32_t targetPartIdx = 0;
> -            if (hasResidual)
> +            if (bInsidePicture)
>              {
> -                bool foundNonZeroCbf = false;
> -                outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf);
> -                assert(foundNonZeroCbf);
> +                m_entropyCoder->resetBits();
> +                m_entropyCoder->encodeSplitFlag(outTempCU, 0, depth);
> +                outTempCU->m_totalBits += m_entropyCoder->getNumberOfWrittenBits(); // split bits
>              }
> -            else
> +            outTempCU->m_totalCost = m_rdCost->calcRdCost(outTempCU->m_totalDistortion, outTempCU->m_totalBits);
> +
> +            if ((g_maxCUSize >> depth) == slice->getPPS()->getMinCuDQPSize() && slice->getPPS()->getUseDQP())
>              {
> -                outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
> +                bool hasResidual = false;
> +                for (uint32_t blkIdx = 0; blkIdx < outTempCU->getTotalNumPart(); blkIdx++)
> +                {
> +                    if (outTempCU->getCbf(blkIdx, TEXT_LUMA) || outTempCU->getCbf(blkIdx, TEXT_CHROMA_U) ||
> +                        outTempCU->getCbf(blkIdx, TEXT_CHROMA_V))
> +                    {
> +                        hasResidual = true;
> +                        break;
> +                    }
> +                }
> +
> +                uint32_t targetPartIdx = 0;
> +                if (hasResidual)
> +                {
> +                    bool foundNonZeroCbf = false;
> +                    outTempCU->setQPSubCUs(outTempCU->getRefQP(targetPartIdx), outTempCU, 0, depth, foundNonZeroCbf);
> +                    assert(foundNonZeroCbf);
> +                }
> +                else
> +                {
> +                    outTempCU->setQPSubParts(outTempCU->getRefQP(targetPartIdx), 0, depth); // set QP to default QP
> +                }
>              }
> +
> +            m_rdSbacCoders[nextDepth][CI_NEXT_BEST]->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
> +            xCheckBestMode(outBestCU, outTempCU, depth); // RD compare current CU against split
>          }
> -
> -        m_rdSbacCoders[nextDepth][CI_NEXT_BEST]->store(m_rdSbacCoders[depth][CI_TEMP_BEST]);
> -        xCheckBestMode(outBestCU, outTempCU, depth); // RD compare current CU against split
>      }
>      outBestCU->copyToPic(depth); // Copy Best data to Picture for next partition prediction.
>
> @@ -1215,6 +1276,7 @@
>      TComMvField mvFieldNeighbours[MRG_MAX_NUM_CANDS << 1]; // double length for mv of both lists
>      uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS];
>      int numValidMergeCand = 0;
> +    bool bTransquantBypassFlag = outTempCU->getCUTransquantBypass(0);
>
>      for (uint32_t i = 0; i < outTempCU->getSlice()->getMaxNumMergeCand(); ++i)
>      {
> @@ -1223,7 +1285,7 @@
>
>      uint8_t depth = outTempCU->getDepth(0);
>      outTempCU->setPartSizeSubParts(SIZE_2Nx2N, 0, depth); // interprets depth relative to LCU level
> -    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +
>      outTempCU->getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours, numValidMergeCand);
>
>      int mergeCandBuffer[MRG_MAX_NUM_CANDS];
> @@ -1260,7 +1322,7 @@
>                  {
>                      // set MC parameters
>                      outTempCU->setPredModeSubParts(MODE_INTER, 0, depth); // interprets depth relative to LCU level
> -                    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +                    outTempCU->setCUTransquantBypassSubParts(bTransquantBypassFlag, 0, depth);
>                      outTempCU->setPartSizeSubParts(SIZE_2Nx2N, 0, depth); // interprets depth relative to LCU level
>                      outTempCU->setMergeFlag(0, true);
>                      outTempCU->setMergeIndex(0, mergeCand);
> @@ -1352,7 +1414,7 @@
>      outTempCU->setSkipFlagSubParts(false, 0, depth);
>      outTempCU->setPartSizeSubParts(partSize, 0, depth);
>      outTempCU->setPredModeSubParts(MODE_INTER, 0, depth);
> -    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +
>
>      m_tmpRecoYuv[depth]->clear(); // TODO: Are either of these clears necessary?
>      m_tmpResiYuv[depth]->clear();
> @@ -1372,7 +1434,7 @@
>      outTempCU->setSkipFlagSubParts(false, 0, depth);
>      outTempCU->setPartSizeSubParts(partSize, 0, depth);
>      outTempCU->setPredModeSubParts(MODE_INTRA, 0, depth);
> -    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +
>
>      m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
>
> @@ -1411,7 +1473,7 @@
>      outTempCU->setSkipFlagSubParts(false, 0, depth);
>      outTempCU->setPartSizeSubParts(partSize, 0, depth);
>      outTempCU->setPredModeSubParts(MODE_INTRA, 0, depth);
> -    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +
>
>      m_search->estIntraPredQT(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
>
> @@ -1460,7 +1522,7 @@
>      outTempCU->setPartSizeSubParts(SIZE_2Nx2N, 0, depth);
>      outTempCU->setPredModeSubParts(MODE_INTRA, 0, depth);
>      outTempCU->setTrIdxSubParts(0, 0, depth);
> -    outTempCU->setCUTransquantBypassSubParts(m_CUTransquantBypassFlagValue, 0, depth);
> +
>
>      m_search->IPCMSearch(outTempCU, m_origYuv[depth], m_tmpPredYuv[depth], m_tmpResiYuv[depth], m_tmpRecoYuv[depth]);
>
> @@ -1567,8 +1629,8 @@
>      pixel* dstCr = cu->getPCMSampleCr();
>
>      uint32_t srcStrideC = fencYuv->getCStride();
> -    uint32_t heightC = height >> 1;
> -    uint32_t widthC = width >> 1;
> +    uint32_t widthC  = width  >> cu->getHorzChromaShift();
> +    uint32_t heightC = height >> cu->getVertChromaShift();

These look like unrelated bug fixes - should these (and similar
changes above) be in a separate patch?

>
>      for (int y = 0; y < heightC; y++)
>      {
> diff -r a4d0d5679c28 -r 8ce774039d12 source/common/param.cpp
> --- a/source/common/param.cpp   Sun May 11 17:32:37 2014 +0900
> +++ b/source/common/param.cpp   Tue May 13 17:23:32 2014 +0530
> @@ -127,6 +127,10 @@
>      param->bEnableConstrainedIntra = 0;
>      param->bEnableStrongIntraSmoothing = 1;
>
> +    /* Transquant Bypass */
> +    param->bEnableTransquantBypass = 0;
> +    param->bEnableCUTransquantBypass = 0;
> +

Should one of these be named bLossless?  The effect of each bool needs
to be better documented (not here, but in x265.h).

>      /* Inter Coding tools */
>      param->searchMethod = X265_HEX_SEARCH;
>      param->subpelRefine = 2;
> @@ -548,6 +552,8 @@
>      OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value);
>      OPT("tskip-fast") p->bEnableTSkipFast = atobool(value);
>      OPT("strong-intra-smoothing") p->bEnableStrongIntraSmoothing = atobool(value);
> +    OPT("transquant-bypass") p->bEnableTransquantBypass = atobool(value);
> +    OPT("cu-transquant-bypass") p->bEnableCUTransquantBypass = atobool(value);
>      OPT("constrained-intra") p->bEnableConstrainedIntra = atobool(value);
>      OPT("open-gop") p->bOpenGOP = atobool(value);
>      OPT("scenecut")
> @@ -890,6 +896,9 @@
>      CHECK(param->maxNumReferences < 1, "maxNumReferences must be 1 or greater.");
>      CHECK(param->maxNumReferences > MAX_NUM_REF, "maxNumReferences must be 16 or smaller.");
>
> +    CHECK((param->bEnableTransquantBypass == 0 && param->bEnableCUTransquantBypass == 1) || param->rdLevel < 5,
> +          "TransquantBypass flag must be enabled if CUTransquantBypass flag is signalled and RD Level must be greater then 5.");

I don't think we should be aborting encodes based on rdlevel.  But I
don't know enough about what the two flags actually do to suggest a
better way to configure this.

> +
>      CHECK(param->sourceWidth < (int)param->maxCUSize || param->sourceWidth < (int)param->maxCUSize,
>            "Picture size must be at least one CTU");
>      CHECK(param->internalCsp < X265_CSP_I420 || X265_CSP_I444 < param->internalCsp,
> @@ -1060,7 +1069,10 @@
>      }
>      x265_log(param, X265_LOG_INFO, "Lookahead / bframes / badapt        : %d / %d / %d\n", param->lookaheadDepth, param->bframes, param->bFrameAdaptive);
>      x265_log(param, X265_LOG_INFO, "b-pyramid / weightp / weightb / refs: %d / %d / %d / %d\n",
> -             param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred, param->maxNumReferences);
> +        param->bBPyramid, param->bEnableWeightedPred, param->bEnableWeightedBiPred, param->maxNumReferences);
> +
> +    x265_log(param, X265_LOG_INFO, "transquant-bypass / cu-transquant-bypass: %d / %d\n", param->bEnableTransquantBypass, param->bEnableCUTransquantBypass);

If the options are disabled most of the time, this line should only be
displayed if one of them is enabled.

> +
>      switch (param->rc.rateControlMode)
>      {
>      case X265_RC_ABR:
> @@ -1140,6 +1152,8 @@
>      BOOL(p->bEnableTransformSkip, "tskip");
>      BOOL(p->bEnableTSkipFast, "tskip-fast");
>      BOOL(p->bEnableStrongIntraSmoothing, "strong-intra-smoothing");
> +    BOOL(p->bEnableTransquantBypass, "transquant-bypass");
> +    BOOL(p->bEnableCUTransquantBypass, "cu-transquant-bypass");
>      BOOL(p->bEnableConstrainedIntra, "constrained-intra");
>      BOOL(p->bOpenGOP, "open-gop");
>      s += sprintf(s, " interlace=%d", p->interlaceMode);
> diff -r a4d0d5679c28 -r 8ce774039d12 source/encoder/encoder.cpp
> --- a/source/encoder/encoder.cpp        Sun May 11 17:32:37 2014 +0900
> +++ b/source/encoder/encoder.cpp        Tue May 13 17:23:32 2014 +0530
> @@ -62,6 +62,8 @@
>      m_numChromaWPFrames = 0;
>      m_numLumaWPBiFrames = 0;
>      m_numChromaWPBiFrames = 0;
> +    m_TransquantBypassEnableFlag = false;
> +    m_CUTransquantBypassFlagValue = false;
>      m_lookahead = NULL;
>      m_frameEncoder = NULL;
>      m_rateControl = NULL;
> @@ -1442,8 +1444,14 @@
>      m_bPCMFilterDisableFlag = false;
>
>      m_useLossless = false;  // x264 configures this via --qp=0
> -    m_TransquantBypassEnableFlag = false;
> -    m_CUTransquantBypassFlagValue = false;
> +    if (p->bEnableTransquantBypass)
> +    {
> +        m_TransquantBypassEnableFlag  = true;
> +    }
> +    if (p->bEnableCUTransquantBypass)
> +    {
> +        m_CUTransquantBypassFlagValue = true;
> +    }

m_TransquantBypassEnableFlag and m_CUTransquantBypassFlagValue should
go away, and the code should be changed to use the new params directly.

>  }
>
>  int Encoder::extractNalData(NALUnitEBSP **nalunits, int& memsize)
> diff -r a4d0d5679c28 -r 8ce774039d12 source/x265.cpp
> --- a/source/x265.cpp   Sun May 11 17:32:37 2014 +0900
> +++ b/source/x265.cpp   Tue May 13 17:23:32 2014 +0530
> @@ -157,6 +157,10 @@
>      { "strong-intra-smoothing",    no_argument, NULL, 0 },
>      { "no-cutree",                 no_argument, NULL, 0 },
>      { "cutree",                    no_argument, NULL, 0 },
> +    { "transquant-bypass",       no_argument, NULL, 0 },
> +    { "no-transquant-bypass",    no_argument, NULL, 0 },
> +    { "cu-transquant-bypass",    no_argument, NULL, 0 },
> +    { "no-cu-transquant-bypass", no_argument, NULL, 0 },
>      { "sar",            required_argument, NULL, 0 },
>      { "overscan",       required_argument, NULL, 0 },
>      { "videoformat",    required_argument, NULL, 0 },
> @@ -362,6 +366,8 @@
>      H0("   --ref <integer>               max number of L0 references to be allowed (1 .. 16) Default %d\n", param->maxNumReferences);
>      H0("-w/--[no-]weightp                Enable weighted prediction in P slices. Default %s\n", OPT(param->bEnableWeightedPred));
>      H0("   --[no-]weightb                Enable weighted prediction in B slices. Default %s\n", OPT(param->bEnableWeightedBiPred));
> +    H0("   --[no-]transquant-bypass      Enable transquant bypass flag. Default %s\n", OPT(param->bEnableTransquantBypass));
> +    H0("   --[no-]cu-transquant-bypass   Scaling, transform and in-loop filter process are bypassed. Default %s\n", OPT(param->bEnableCUTransquantBypass));
>      H0("\nRate control and rate distortion options:\n");
>      H0("   --bitrate <integer>           Target bitrate (kbps), implies ABR. Default %d\n", param->rc.bitrate);
>      H0("   --crf <float>                 Quality-based VBR (0-51). Default %f\n", param->rc.rfConstant);
> diff -r a4d0d5679c28 -r 8ce774039d12 source/x265.h
> --- a/source/x265.h     Sun May 11 17:32:37 2014 +0900
> +++ b/source/x265.h     Tue May 13 17:23:32 2014 +0530
> @@ -635,6 +635,9 @@
>       * Default is 0, which is recommended */
>      int       crQpOffset;
>
> +    int bEnableTransquantBypass;    /* transquant_bypass_enable_flag setting */
> +    int bEnableCUTransquantBypass;  /* if transquant_bypass_enable_flag is set, then all CU transquant bypass flags will be set to true. */

This doesn't tell the user anything useful.  The comments here need to
describe what effect the option has on the outputs, and if there is an
rdLevel requirement it must be clearly documented.  And in general
it's not clear we need to expose two different flags to the user,
especially if they have inter-dependencies.  This could probably be
exposed to the user more clearly with a single enumeration.

Any patch that changes the public API must bump X265_BUILD.

Any patch which adds a new CLI option must update doc/reST/cli.rst and
perhaps other pages of the user manual.

-- 
Steve Borho