[x265] [PATCH] Separated the logic block of code as standalone function for vectorization

Tue Jun 25 19:26:45 CEST 2013

On Tue, Jun 25, 2013 at 7:41 AM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User praveentiwari
> # Date 1372164077 -19800
> # Node ID 08a1bf9ba9bcd79e62059615f99aa86fee0d4f79
> # Parent  4e7a3fc1c49854ab89de035b6bc9690627463d69
> Separated the logic block of code as standalone function for vectorization
>
> diff -r 4e7a3fc1c498 -r 08a1bf9ba9bc source/Lib/TLibCommon/TComTrQuant.cpp
> --- a/source/Lib/TLibCommon/TComTrQuant.cpp     Tue Jun 25 17:38:30 2013
> +0530
> +++ b/source/Lib/TLibCommon/TComTrQuant.cpp     Tue Jun 25 18:11:17 2013
> +0530
> @@ -739,6 +739,43 @@
>      } // TU loop
>  }
>

This should be a C-ref primitive in a file in common/, either dct.cpp or in
a new quant.cpp.

+void xCalQuantCoef(Int *  piCoef,
> +                   Int *  piQuantCoeff,
> +                   Int *  deltaU,
> +                   Int *  piQCoef,
> +                   Int *  piArlCCoef,
> +                   UInt & uiAcSum,
> +                   Int    iQBitsC,
> +                   Int    iQBits,
> +                   Int    iAdd,
> +                   Int    iWidth,
> +                   Int    iHeight,
> +                   Bool   m_bUseAdaptQpSelect)
>

Drop the m_ prefix, that function argument is not a member variable.   And
our performance primitives need to be "cleaned" of the HM style.  This
means drop all the hungarian notation prefixes ("pi for pointer to
integer", "ui prefix for unsigned integer", "i for signed integer"). For
example, iQBitsC -> qbitsc.

The accumulated sum should be the return value from this primitive, not
passed as a reference.

It doesn't seem necessary to pass width and height, it only needs the total
block size (width * height).

> +{
> +    Int iAddC   = 1 << (iQBitsC - 1);
> +    Int qBits8 = iQBits - 8;
> +
> +    for (Int n = 0; n < iWidth * iHeight; n++)
> +    {
> +        Int iLevel;
> +        Int  iSign;
> +        UInt uiBlockPos = n;
>

just use n.  declaring uiBlockPos here is just silly (and yes I know this
came directly from the HM routine).

> +        iLevel  = piCoef[uiBlockPos];
> +        iSign   = (iLevel < 0 ? -1 : 1);
> +
> +        Int64 tmpLevel = (Int64)abs(iLevel) * piQuantCoeff[uiBlockPos];
> +        if (m_bUseAdaptQpSelect)
> +        {
> +            piArlCCoef[uiBlockPos] = (Int)((tmpLevel + iAddC) >> iQBitsC);
> +        }
> +        iLevel = (Int)((tmpLevel + iAdd) >> iQBits);
> +        deltaU[uiBlockPos] = (Int)((tmpLevel - (iLevel << iQBits)) >>
> qBits8);
> +        uiAcSum += iLevel;
> +        iLevel *= iSign;
> +        piQCoef[uiBlockPos] = Clip3(-32768, 32767, iLevel);
> +    }
> +}
>
>
I think there should be two primitives for Quant, one with adaptive QP
select, and one without.  The one without should have a lot fewer arguments.

>  Void TComTrQuant::xQuant(TComDataCU* pcCU,
>                           Int*        pSrc,
>                           TCoeff*     pDes,
> @@ -818,28 +855,8 @@
>          Int iQBits = QUANT_SHIFT + cQpBase.m_iPer + iTransformShift;
>          iAdd = (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85)
> << (iQBits - 9);
>          Int iQBitsC = QUANT_SHIFT + cQpBase.m_iPer + iTransformShift -
> ARL_C_PRECISION;
> -        Int iAddC   = 1 << (iQBitsC - 1);
>
> -        Int qBits8 = iQBits - 8;
> -        for (Int n = 0; n < iWidth * iHeight; n++)
> -        {
> -            Int iLevel;
> -            Int  iSign;
> -            UInt uiBlockPos = n;
> -            iLevel  = piCoef[uiBlockPos];
> -            iSign   = (iLevel < 0 ? -1 : 1);
> -
> -            Int64 tmpLevel = (Int64)abs(iLevel) *
> piQuantCoeff[uiBlockPos];
> -            if (m_bUseAdaptQpSelect)
> -            {
> -                piArlCCoef[uiBlockPos] = (Int)((tmpLevel + iAddC) >>
> iQBitsC);
> -            }
> -            iLevel = (Int)((tmpLevel + iAdd) >> iQBits);
> -            deltaU[uiBlockPos] = (Int)((tmpLevel - (iLevel << iQBits)) >>
> qBits8);
> -            uiAcSum += iLevel;
> -            iLevel *= iSign;
> -            piQCoef[uiBlockPos] = Clip3(-32768, 32767, iLevel);
> -        } // for n
> +        xCalQuantCoef(piCoef, piQuantCoeff, deltaU, piQCoef, piArlCCoef,
> uiAcSum, iQBitsC, iQBits, iAdd, iWidth, iHeight, m_bUseAdaptQpSelect);
>
>          if (pcCU->getSlice()->getPPS()->getSignHideFlag())
>          {
> @@ -976,7 +993,7 @@
>          assert(bitDepth == 8);
>
>          const UInt uiLog2BlockSize = g_aucConvertToBit[uiWidth];
>

This patch has a number of white-space fixes; they need to be separated
into their own patch.

> -        x265::primitives.dct[x265::DCT_4x4 + uiLog2BlockSize -
> ((uiWidth==4) && (uiMode != REG_DCT))](pcResidual, m_plTempCoeff, uiStride);
> +        x265::primitives.dct[x265::DCT_4x4 + uiLog2BlockSize - ((uiWidth
> == 4) && (uiMode != REG_DCT))](pcResidual, m_plTempCoeff, uiStride);
>
>          assert(uiWidth == uiHeight);
>      }
> @@ -1021,10 +1038,10 @@
>      else
>      {
>          // ChECK_ME: I assume we don't use HIGH_BIT_DEPTH here
> -        assert( bitDepth == 8 );
> +        assert(bitDepth == 8);
>
>          const UInt uiLog2BlockSize = g_aucConvertToBit[uiWidth];
> -        x265::primitives.idct[x265::IDCT_4x4 + uiLog2BlockSize -
> ((uiWidth==4) && (uiMode != REG_DCT))](m_plTempCoeff, rpcResidual,
> uiStride);
> +        x265::primitives.idct[x265::IDCT_4x4 + uiLog2BlockSize -
> ((uiWidth == 4) && (uiMode != REG_DCT))](m_plTempCoeff, rpcResidual,
> uiStride);
>      }
>  }
>
> @@ -1093,11 +1110,11 @@
>  Void TComTrQuant::xIT(Int bitDepth, UInt uiMode, Int* plCoef, Short*
> pResidual, UInt uiStride, Int iWidth, Int iHeight)
>  {
>      // ChECK_ME: I assume we don't use HIGH_BIT_DEPTH here
> -    assert( bitDepth == 8 );
> +    assert(bitDepth == 8);
>
>      //xITrMxN(bitDepth, coeff, block, iWidth, iHeight, uiMode);
>      const UInt uiLog2BlockSize = g_aucConvertToBit[iWidth];
> -    x265::primitives.idct[x265::IDCT_4x4 + uiLog2BlockSize - ((iWidth==4)
> && (uiMode != REG_DCT))](plCoef, pResidual, uiStride);
> +    x265::primitives.idct[x265::IDCT_4x4 + uiLog2BlockSize - ((iWidth ==
> 4) && (uiMode != REG_DCT))](plCoef, pResidual, uiStride);
>  }
>
>  /** Wrapper function between HM interface and core 4x4 transform skipping
> @@ -1850,12 +1867,12 @@
>   * \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
>   * \returns cost of given absolute transform level
>   */
> -__inline Double TComTrQuant::xGetICRateCost(UInt uiAbsLevel,
> +__inline Double TComTrQuant::xGetICRateCost(UInt   uiAbsLevel,
>                                              UShort ui16CtxNumOne,
>                                              UShort ui16CtxNumAbs,
>                                              UShort ui16AbsGoRice,
> -                                            UInt c1Idx,
> -                                            UInt c2Idx) const
> +                                            UInt   c1Idx,
> +                                            UInt   c2Idx) const
>  {
>      Double iRate = xGetIEPRate();
>      UInt baseLevel  =  (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx <
> C2FLAG_NUMBER)) : 1;
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130625/808a926a/attachment-0001.html>