[x265] cleanup m_sharedPredTransformSkip[]

Thu Mar 6 07:04:43 CET 2014

On Tue, Mar 4, 2014 at 4:40 AM, Satoshi Nakagawa <nakagawa424 at oki.com> wrote:
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1393929339 -32400
> #      Tue Mar 04 19:35:39 2014 +0900
> # Node ID 7a61566806f691ddff84cbbc42801f6c2d46df88
> # Parent  3cbde0b893e34e5770cc311d3f4b6fe064c27774
> cleanup m_sharedPredTransformSkip[]
>
> The NEW_CALCRECON macro is a TODO marker for asm experts to optimize register assignment.

Sorry I haven't responded to this yet; I would like Min to review it
before I push it.

> diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Tue Mar 04 19:35:39 2014 +0900
> @@ -63,7 +63,6 @@
>      m_qtTempTUCoeffCr = NULL;
>      for (int i = 0; i < 3; i++)
>      {
> -        m_sharedPredTransformSkip[i] = NULL;
>          m_qtTempTransformSkipFlag[i] = NULL;
>          m_qtTempCbf[i] = NULL;
>      }
> @@ -96,7 +95,6 @@
>      for (uint32_t i = 0; i < 3; ++i)
>      {
>          X265_FREE(m_qtTempCbf[i]);
> -        X265_FREE(m_sharedPredTransformSkip[i]);
>          X265_FREE(m_qtTempTransformSkipFlag[i]);
>      }
>
> @@ -153,9 +151,6 @@
>      CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
>      CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
>
> -    CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> -    CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> -    CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
>      CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
>      CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
>      CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
> @@ -414,7 +409,6 @@
>      Pel*     fenc         = fencYuv->getLumaAddr(absPartIdx);
>      Pel*     pred         = predYuv->getLumaAddr(absPartIdx);
>      int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
> -    Pel*     recon        = predYuv->getLumaAddr(absPartIdx);
>      int      chFmt        = cu->getChromaFormat();
>      int      part         = partitionFromSizes(width, height);
>
> @@ -439,15 +433,6 @@
>          cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt);
>          //===== get prediction signal =====
>          predIntraLumaAng(lumaPredMode, pred, stride, width);
> -        // save prediction
> -        if (default0Save1Load2 == 1)
> -        {
> -            primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], width, pred, stride);
> -        }
> -    }
> -    else
> -    {
> -        primitives.luma_copy_pp[part](pred, stride, m_sharedPredTransformSkip[0], width);
>      }
>
>      //===== get residual signal =====
> @@ -491,12 +476,19 @@
>          primitives.blockfill_s[size](resiTmp, stride, 0);
>      }
>
> +    assert(width <= 32);
> +#if NEW_CALCRECON
>      //===== reconstruction =====
> -    assert(width <= 32);
> +    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> +    //===== update distortion =====
> +    outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
> +#else
> +    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
> +    //===== reconstruction =====
>      primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> -
>      //===== update distortion =====
>      outDist += primitives.sse_pp[part](fenc, stride, recon, stride);
> +#endif
>  }
>
>  void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
> @@ -534,7 +526,6 @@
>      Pel*     fenc           = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
>      Pel*     pred           = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
>      int16_t* residual       = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) : resiYuv->getCbAddr(absPartIdx));
> -    Pel*     recon          = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
>
>      uint32_t qtlayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
>      uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth() * cu->getSlice()->getSPS()->getMaxCUHeight() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
> @@ -561,19 +552,6 @@
>
>          //===== get prediction signal =====
>          predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, height, chFmt);
> -
> -        // save prediction
> -        if (default0Save1Load2 == 1)
> -        {
> -            Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
> -            primitives.luma_copy_pp[part](predbuf, width, pred, stride);
> -        }
> -    }
> -    else
> -    {
> -        // load prediction
> -        Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
> -        primitives.luma_copy_pp[part](pred, stride, predbuf, width);
>      }
>
>      //===== get residual signal =====
> @@ -627,12 +605,20 @@
>          }
>      }
>
> +    assert(((intptr_t)residual & (width - 1)) == 0);
> +    assert(width <= 32);
> +#if NEW_CALCRECON
>      //===== reconstruction =====
> -    assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
> -    assert(width <= 32);
> +    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> +    //===== update distortion =====
> +    uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
> +#else
> +    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
> +    //===== reconstruction =====
>      primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
>      //===== update distortion =====
>      uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
> +#endif
>      if (ttype == TEXT_CHROMA_U)
>      {
>          outDist += m_rdCost->scaleChromaDistCb(dist);
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.h
> --- a/source/Lib/TLibEncoder/TEncSearch.h       Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/Lib/TLibEncoder/TEncSearch.h       Tue Mar 04 19:35:39 2014 +0900
> @@ -84,7 +84,6 @@
>  protected:
>
>      ShortYuv*      m_qtTempShortYuv;
> -    pixel*          m_sharedPredTransformSkip[3];
>
>      TCoeff**        m_qtTempCoeffY;
>      TCoeff**        m_qtTempCoeffCb;
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/pixel.cpp
> --- a/source/common/pixel.cpp   Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/pixel.cpp   Tue Mar 04 19:35:39 2014 +0900
> @@ -460,20 +460,33 @@
>  }
>
>  template<int blockSize>
> -void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
> +void calcRecons(pixel* pred, int16_t* residual,
> +#if NEW_CALCRECON
> +                pixel*,
> +#else
> +                pixel* recon,
> +#endif
> +                int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
>  {
>      for (int uiY = 0; uiY < blockSize; uiY++)
>      {
>          for (int uiX = 0; uiX < blockSize; uiX++)
>          {
> +#if NEW_CALCRECON
> +            recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
> +            recipred[uiX] = (pixel)recqt[uiX];
> +#else
>              recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
>              recqt[uiX] = (int16_t)recon[uiX];
>              recipred[uiX] = recon[uiX];
> +#endif
>          }
>
>          pred += stride;
>          residual += stride;
> +#if !NEW_CALCRECON
>          recon += stride;
> +#endif
>          recqt += qtstride;
>          recipred += ipredstride;
>      }
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/primitives.h
> --- a/source/common/primitives.h        Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/primitives.h        Tue Mar 04 19:35:39 2014 +0900
> @@ -34,6 +34,8 @@
>  #include "cpu.h"
>  #include "x265.h"
>
> +#define NEW_CALCRECON 1 // TODO: remove recon[] arg
> +
>  #define FENC_STRIDE 64
>
>  #define NUM_INTRA_MODE 35
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/common/x86/pixel-util8.asm Tue Mar 04 19:35:39 2014 +0900
> @@ -57,6 +57,7 @@
>  cextern pw_2000
>  cextern pw_pixel_max
>
> +%define NEW_CALCRECON 1 ; TODO: remove recon[] arg
>  ;-----------------------------------------------------------------------------
>  ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
>  ;-----------------------------------------------------------------------------
> @@ -101,7 +102,9 @@
>      CLIPW       m0, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movh        [t2], m0
> +%endif
>      movh        [t4], m0
>  %if ARCH_X86_64 == 0
>      add         t4, t7
> @@ -113,7 +116,9 @@
>      movhps      [t4 + t7], m0
>      lea         t4, [t4 + t7 * 2]
>  %endif
> +%if NEW_CALCRECON == 0
>      movhps      [t2 + t5], m0
> +%endif
>
>      ; store recqt[]
>      movh        [t3], m0
> @@ -123,7 +128,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -165,11 +172,15 @@
>      packuswb    m1, m1
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movd        [t2], m1
> +%endif
>      movd        [t4], m1
>      add         t4, t7
>      pshufd      m2, m1, 1
> +%if NEW_CALCRECON == 0
>      movd        [t2 + t5], m2
> +%endif
>      movd        [t4], m2
>      add         t4, t7
>
> @@ -182,7 +193,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 4]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -231,8 +244,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2], m0
>      movu        [t2 + t5], m1
> +%endif
>      movu        [t4], m0
>  %if ARCH_X86_64 == 0
>      add         t4, t7
> @@ -253,7 +268,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -295,8 +312,10 @@
>      packuswb    m1, m2
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movlps      [t2], m1
>      movhps      [t2 + t5], m1
> +%endif
>      movlps      [t4], m1
>  %if ARCH_X86_64 == 0
>      add         t4, t7
> @@ -317,7 +336,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 4]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -367,8 +388,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2], m0
>      movu        [t2 + 16], m1
> +%endif
>      movu        [t4], m0
>      movu        [t4 + 16], m1
>  %if ARCH_X86_64 == 0
> @@ -391,8 +414,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2 + t5], m0
>      movu        [t2 + t5 + 16], m1
> +%endif
>  %if ARCH_X86_64 == 0
>      movu        [t4], m0
>      movu        [t4 + 16], m1
> @@ -411,7 +436,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -451,7 +478,9 @@
>      packuswb    m1, m2
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2], m1
> +%endif
>      movu        [t4], m1
>
>      ; store recqt[]
> @@ -464,7 +493,9 @@
>      add         t4, t7
>      add         t0, t5
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      add         t2, t5
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -513,8 +544,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2], m0
>      movu        [t2 + 16], m1
> +%endif
>      movu        [t4], m0
>      movu        [t4 + 16], m1
>
> @@ -532,8 +565,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2 + 32], m0
>      movu        [t2 + 48], m1
> +%endif
>      movu        [t4 + 32], m0
>      movu        [t4 + 48], m1
>  %if ARCH_X86_64 == 0
> @@ -556,8 +591,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2 + t5], m0
>      movu        [t2 + t5 + 16], m1
> +%endif
>  %if ARCH_X86_64 == 0
>      movu        [t4], m0
>      movu        [t4 + 16], m1
> @@ -580,8 +617,10 @@
>      CLIPW       m1, m4, m5
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2 + t5 + 32], m0
>      movu        [t2 + t5 + 48], m1
> +%endif
>  %if ARCH_X86_64 == 0
>      movu        [t4 + 32], m0
>      movu        [t4 + 48], m1
> @@ -600,7 +639,9 @@
>
>      lea         t0, [t0 + t5 * 2]
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      lea         t2, [t2 + t5 * 2]
> +%endif
>
>      dec         t8d
>      jnz        .loop
> @@ -648,8 +689,10 @@
>      packuswb    m3, m4
>
>      ; store recon[] and recipred[]
> +%if NEW_CALCRECON == 0
>      movu        [t2], m1
>      movu        [t2 + 16], m3
> +%endif
>      movu        [t4], m1
>      movu        [t4 + 16], m3
>
> @@ -667,7 +710,9 @@
>      add         t4, t7
>      add         t0, t5
>      lea         t1, [t1 + t5 * 2]
> +%if NEW_CALCRECON == 0
>      add         t2, t5
> +%endif
>
>      dec         t8d
>      jnz        .loop
> diff -r 3cbde0b893e3 -r 7a61566806f6 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp      Mon Mar 03 13:37:35 2014 -0600
> +++ b/source/test/pixelharness.cpp      Tue Mar 04 19:35:39 2014 +0900
> @@ -351,10 +351,12 @@
>          {
>              return false;
>          }
> +#if !NEW_CALCRECON
>          if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)))
>          {
>              return false;
>          }
> +#endif
>          if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
>          {
>              return false;
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho