[x265] [PATCH] shortyuv: integrated asm primitives for blockcopy

Steve Borho steve at borho.org
Fri Mar 7 20:20:44 CET 2014


On Fri, Mar 7, 2014 at 5:45 AM,  <murugan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Murugan Vairavel <murugan at multicorewareinc.com>
> # Date 1394192688 -19800
> #      Fri Mar 07 17:14:48 2014 +0530
> # Node ID 5c626bf1e275596b45808c14952bd5aea8aaeb3e
> # Parent  2bf727dca27d6f69e96d4412850661cbe036cbef
> shortyuv: integrated asm primitives for blockcopy
>
> diff -r 2bf727dca27d -r 5c626bf1e275 source/common/shortyuv.cpp
> --- a/source/common/shortyuv.cpp        Fri Mar 07 15:11:13 2014 +0530
> +++ b/source/common/shortyuv.cpp        Fri Mar 07 17:14:48 2014 +0530
> @@ -129,32 +129,27 @@
>
>  void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
>  {
> +    int part = partitionFromSizes(width, height);
>      int16_t* src = getLumaAddr(partIdx);
>      int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
>
>      uint32_t srcStride = m_width;
>      uint32_t dstStride = dstPicYuv->m_width;
> -#if HIGH_BIT_DEPTH
> -    primitives.blockcpy_pp(width, height, (pixel*)dst, dstStride, (pixel*)src, srcStride);
> -#else
> -    for (uint32_t y = height; y != 0; y--)
> -    {
> -        ::memcpy(dst, src, width * sizeof(int16_t));
> -        src += srcStride;
> -        dst += dstStride;
> -    }
> -#endif
> +
> +    primitives.luma_copy_ss[part](dst, dstStride, src, srcStride);
> +
>  }
>
>  void ShortYuv::copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
>  {
> +    int part = partitionFromSizes(width, height);
>      int16_t* src = getLumaAddr(partIdx);
>      pixel* dst = dstPicYuv->getLumaAddr(partIdx);
>
>      uint32_t srcStride = m_width;
>      uint32_t dstStride = dstPicYuv->getStride();
>
> -    primitives.blockcpy_ps(width, height, dst, dstStride, src, srcStride);
> +    primitives.luma_copy_sp[part](dst, dstStride, src, srcStride);
>  }
>
>  void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
> @@ -163,23 +158,15 @@
>      int16_t* srcV = getCrAddr(partIdx);
>      int16_t* dstU = dstPicYuv->getCbAddr(partIdx);
>      int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
> +    width  = width  << m_hChromaShift;
> +    height = height << m_vChromaShift;
> +    int part = partitionFromSizes(width, height);
>
>      uint32_t srcStride = m_cwidth;
>      uint32_t dstStride = dstPicYuv->m_cwidth;
> -#if HIGH_BIT_DEPTH
> -    primitives.blockcpy_pp(width, height, (pixel*)dstU, dstStride, (pixel*)srcU, srcStride);
> -    primitives.blockcpy_pp(width, height, (pixel*)dstV, dstStride, (pixel*)srcV, srcStride);
> -#else
> -    for (uint32_t y = height; y != 0; y--)
> -    {
> -        ::memcpy(dstU, srcU, width * sizeof(int16_t));
> -        ::memcpy(dstV, srcV, width * sizeof(int16_t));
> -        srcU += srcStride;
> -        srcV += srcStride;
> -        dstU += dstStride;
> -        dstV += dstStride;
> -    }
> -#endif
> +
> +    primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
> +    primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);


I'm pretty sure this is wrong.  The chroma primitives are indexed by
color space and luma partition size, so the function pointer itself
scales the luma dimensions down to the correct chroma dimensions for
that color space.

So essentially this function cannot use the chroma block copy
functions without changing the function arguments and callers to pass
in luma dimensions or the luma partition enum.

>  }
>
>  void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
> @@ -188,28 +175,29 @@
>      int16_t* srcV = getCrAddr(partIdx);
>      pixel* dstU = dstPicYuv->getCbAddr(partIdx);
>      pixel* dstV = dstPicYuv->getCrAddr(partIdx);
> +    width  = width  << m_hChromaShift;
> +    height = height << m_vChromaShift;
> +    int part = partitionFromSizes(width, height);
>
>      uint32_t srcStride = m_cwidth;
>      uint32_t dstStride = dstPicYuv->getCStride();
>
> -    primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
> -    primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
> +    primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
> +    primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);

ditto

>  }
>
>  void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId)
>  {
> +    width  = width  << m_hChromaShift;
> +    height = height << m_vChromaShift;
> +    int part = partitionFromSizes(width, height);
>      if (chromaId == 0)
>      {
>          int16_t* srcU = getCbAddr(partIdx);
>          int16_t* dstU = dstPicYuv->getCbAddr(partIdx);
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->m_cwidth;
> -        for (uint32_t y = height; y != 0; y--)
> -        {
> -            ::memcpy(dstU, srcU, width * sizeof(int16_t));
> -            srcU += srcStride;
> -            dstU += dstStride;
> -        }
> +        primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
>      }
>      else if (chromaId == 1)
>      {
> @@ -217,12 +205,7 @@
>          int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->m_cwidth;
> -        for (uint32_t y = height; y != 0; y--)
> -        {
> -            ::memcpy(dstV, srcV, width * sizeof(int16_t));
> -            srcV += srcStride;
> -            dstV += dstStride;
> -        }
> +        primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);
>      }
>      else
>      {
> @@ -232,27 +215,23 @@
>          int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->m_cwidth;
> -        for (uint32_t y = height; y != 0; y--)
> -        {
> -            ::memcpy(dstU, srcU, width * sizeof(int16_t));
> -            ::memcpy(dstV, srcV, width * sizeof(int16_t));
> -            srcU += srcStride;
> -            srcV += srcStride;
> -            dstU += dstStride;
> -            dstV += dstStride;
> -        }
> +        primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
> +        primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);

and... ditto

>      }
>  }
>
>  void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId)
>  {
> +    width  = width  << m_hChromaShift;
> +    height = height << m_vChromaShift;
> +    int part = partitionFromSizes(width, height);
>      if (chromaId == 0)
>      {
>          int16_t* srcU = getCbAddr(partIdx);
>          pixel* dstU = dstPicYuv->getCbAddr(partIdx);
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->getCStride();
> -        primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
> +        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
>      }
>      else if (chromaId == 1)
>      {
> @@ -260,7 +239,7 @@
>          pixel* dstV = dstPicYuv->getCrAddr(partIdx);
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->getCStride();
> -        primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
> +        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
>      }
>      else
>      {
> @@ -271,7 +250,7 @@
>
>          uint32_t srcStride = m_cwidth;
>          uint32_t dstStride = dstPicYuv->getCStride();
> -        primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
> -        primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
> +        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
> +        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);

still busted.

Have a look at how the TComYuv methods were modified to handle this.

-- 
Steve Borho


More information about the x265-devel mailing list