[x265] [PATCH] shortyuv: integrated asm primitives for blockcopy

Murugan Vairavel murugan at multicorewareinc.com
Mon Mar 10 12:29:34 CET 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1394450962 -19800
#      Mon Mar 10 16:59:22 2014 +0530
# Node ID 4c8c208d66f2a159336fd9aa4522cb4aafe013d7
# Parent  50d7910ddd61632deeed969cf8e474561b175622
shortyuv: integrated asm primitives for blockcopy

diff -r 50d7910ddd61 -r 4c8c208d66f2 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Sun Mar 09 22:10:39 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Mar 10 16:59:22 2014 +0530
@@ -1110,8 +1110,7 @@
         m_qtTempShortYuv[qtlayer].copyPartToPartLuma(reconYuv, absPartIdx, 1 << trSizeLog2, 1 << trSizeLog2);
         if (!bLumaOnly && !bSkipChroma)
         {
-            uint32_t trSizeCLog2 = (bChromaSame ? trSizeLog2 : trSizeLog2 - 1);
-            m_qtTempShortYuv[qtlayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2);
+            m_qtTempShortYuv[qtlayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << trSizeLog2, bChromaSame);
         }
     }
     else
@@ -1166,8 +1165,7 @@
 
     if (!bLumaOnly && !bSkipChroma)
     {
-        uint32_t trSizeCLog2 = (bChromaSame ? trSizeLog2 : trSizeLog2 - 1);
-        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(&m_qtTempTransformSkipYuv, absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2);
+        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(&m_qtTempTransformSkipYuv, absPartIdx, 1 << trSizeLog2, bChromaSame);
     }
 }
 
@@ -1280,8 +1278,8 @@
         }
 
         //===== copy reconstruction =====
-        uint32_t trSizeCLog2 = (bChromaSame ? trSizeLog2 : trSizeLog2 - 1);
-        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(&m_qtTempTransformSkipYuv, absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2, stateU0V1Both2);
+        uint32_t lumaSize = 1 << (bChromaSame ? trSizeLog2 + 1 : trSizeLog2);
+        m_qtTempShortYuv[qtlayer].copyPartToPartYuvChroma(&m_qtTempTransformSkipYuv, absPartIdx, lumaSize, stateU0V1Both2);
     }
 }
 
@@ -1538,8 +1536,7 @@
         ::memcpy(coeffDstV, coeffSrcV, sizeof(TCoeff) * numCoeffC);
 
         //===== copy reconstruction =====
-        uint32_t trSizeCLog2 = (bChromaSame || (chFmt == CHROMA_444))  ? trSizeLog2 : trSizeLog2 - 1;
-        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2);
+        m_qtTempShortYuv[qtlayer].copyPartToPartChroma(reconYuv, absPartIdx, 1 << trSizeLog2, bChromaSame);
     }
     else
     {
@@ -4155,6 +4152,7 @@
         const uint32_t qtlayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
 
         bool  bCodeChroma   = true;
+        bool bChromaSame = false;
         uint32_t  trModeC     = trMode;
         if ((trSizeLog2 == 2) && !(chFmt == CHROMA_444))
         {
@@ -4162,6 +4160,7 @@
             trModeC--;
             uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trModeC) << 1);
             bCodeChroma  = ((absPartIdx % qpdiv) == 0);
+            bChromaSame = true;
         }
 
         if (bSpatial)
@@ -4172,7 +4171,7 @@
 
             if (bCodeChroma)
             {
-                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, 1 << trSizeCLog2, 1 << trSizeCLog2);
+                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, 1 << trSizeLog2, bChromaSame);
             }
         }
         else
diff -r 50d7910ddd61 -r 4c8c208d66f2 source/common/shortyuv.cpp
--- a/source/common/shortyuv.cpp	Sun Mar 09 22:10:39 2014 -0500
+++ b/source/common/shortyuv.cpp	Mon Mar 10 16:59:22 2014 +0530
@@ -115,50 +115,34 @@
     primitives.pixeladd_ss(cpartSize, cpartSize, getCrAddr(trUnitIdx, cpartSize), m_cwidth, srcV0, srcV1, srcYuv0->m_cwidth, srcYuv1->m_cwidth);
 }
 
-void ShortYuv::copyPartToPartYuv(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
-{
-    copyPartToPartLuma(dstPicYuv, partIdx, width, height);
-    copyPartToPartChroma(dstPicYuv, partIdx, width >> m_hChromaShift, height >> m_vChromaShift);
-}
-
-void ShortYuv::copyPartToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
-{
-    copyPartToPartLuma(dstPicYuv, partIdx, width, height);
-    copyPartToPartChroma(dstPicYuv, partIdx, width >> m_hChromaShift, height >> m_vChromaShift);
-}
-
 void ShortYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
 {
+    int part = partitionFromSizes(width, height);
     int16_t* src = getLumaAddr(partIdx);
     int16_t* dst = dstPicYuv->getLumaAddr(partIdx);
 
     uint32_t srcStride = m_width;
     uint32_t dstStride = dstPicYuv->m_width;
-#if HIGH_BIT_DEPTH
-    primitives.blockcpy_pp(width, height, (pixel*)dst, dstStride, (pixel*)src, srcStride);
-#else
-    for (uint32_t y = height; y != 0; y--)
-    {
-        ::memcpy(dst, src, width * sizeof(int16_t));
-        src += srcStride;
-        dst += dstStride;
-    }
-#endif
+
+    primitives.luma_copy_ss[part](dst, dstStride, src, srcStride);
+
 }
 
 void ShortYuv::copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
 {
+    int part = partitionFromSizes(width, height);
     int16_t* src = getLumaAddr(partIdx);
     pixel* dst = dstPicYuv->getLumaAddr(partIdx);
 
     uint32_t srcStride = m_width;
     uint32_t dstStride = dstPicYuv->getStride();
 
-    primitives.blockcpy_ps(width, height, dst, dstStride, src, srcStride);
+    primitives.luma_copy_sp[part](dst, dstStride, src, srcStride);
 }
 
-void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
+void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
 {
+    int part = partitionFromSizes(lumaSize, lumaSize);
     int16_t* srcU = getCbAddr(partIdx);
     int16_t* srcV = getCrAddr(partIdx);
     int16_t* dstU = dstPicYuv->getCbAddr(partIdx);
@@ -166,24 +150,22 @@
 
     uint32_t srcStride = m_cwidth;
     uint32_t dstStride = dstPicYuv->m_cwidth;
-#if HIGH_BIT_DEPTH
-    primitives.blockcpy_pp(width, height, (pixel*)dstU, dstStride, (pixel*)srcU, srcStride);
-    primitives.blockcpy_pp(width, height, (pixel*)dstV, dstStride, (pixel*)srcV, srcStride);
-#else
-    for (uint32_t y = height; y != 0; y--)
+
+    if (bChromaSame)
     {
-        ::memcpy(dstU, srcU, width * sizeof(int16_t));
-        ::memcpy(dstV, srcV, width * sizeof(int16_t));
-        srcU += srcStride;
-        srcV += srcStride;
-        dstU += dstStride;
-        dstV += dstStride;
+        primitives.luma_copy_ss[part](dstU, dstStride, srcU, srcStride);
+        primitives.luma_copy_ss[part](dstV, dstStride, srcV, srcStride);
     }
-#endif
+    else
+    {
+        primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
+        primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);
+    }
 }
 
-void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height)
+void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame)
 {
+    int part = partitionFromSizes(lumaSize, lumaSize);
     int16_t* srcU = getCbAddr(partIdx);
     int16_t* srcV = getCrAddr(partIdx);
     pixel* dstU = dstPicYuv->getCbAddr(partIdx);
@@ -192,24 +174,28 @@
     uint32_t srcStride = m_cwidth;
     uint32_t dstStride = dstPicYuv->getCStride();
 
-    primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
-    primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
+    if (bChromaSame)
+    {
+        primitives.luma_copy_sp[part](dstU, dstStride, srcU, srcStride);
+        primitives.luma_copy_sp[part](dstV, dstStride, srcV, srcStride);
+    }
+    else
+    {
+        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
+        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
+    }
 }
 
-void ShortYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId)
+void ShortYuv::copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId)
 {
+    int part = partitionFromSizes(lumaSize, lumaSize);
     if (chromaId == 0)
     {
         int16_t* srcU = getCbAddr(partIdx);
         int16_t* dstU = dstPicYuv->getCbAddr(partIdx);
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->m_cwidth;
-        for (uint32_t y = height; y != 0; y--)
-        {
-            ::memcpy(dstU, srcU, width * sizeof(int16_t));
-            srcU += srcStride;
-            dstU += dstStride;
-        }
+        primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
     }
     else if (chromaId == 1)
     {
@@ -217,12 +203,7 @@
         int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->m_cwidth;
-        for (uint32_t y = height; y != 0; y--)
-        {
-            ::memcpy(dstV, srcV, width * sizeof(int16_t));
-            srcV += srcStride;
-            dstV += dstStride;
-        }
+        primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);
     }
     else
     {
@@ -232,27 +213,21 @@
         int16_t* dstV = dstPicYuv->getCrAddr(partIdx);
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->m_cwidth;
-        for (uint32_t y = height; y != 0; y--)
-        {
-            ::memcpy(dstU, srcU, width * sizeof(int16_t));
-            ::memcpy(dstV, srcV, width * sizeof(int16_t));
-            srcU += srcStride;
-            srcV += srcStride;
-            dstU += dstStride;
-            dstV += dstStride;
-        }
+        primitives.chroma[m_csp].copy_ss[part](dstU, dstStride, srcU, srcStride);
+        primitives.chroma[m_csp].copy_ss[part](dstV, dstStride, srcV, srcStride);
     }
 }
 
-void ShortYuv::copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId)
+void ShortYuv::copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId)
 {
+    int part = partitionFromSizes(lumaSize, lumaSize);
     if (chromaId == 0)
     {
         int16_t* srcU = getCbAddr(partIdx);
         pixel* dstU = dstPicYuv->getCbAddr(partIdx);
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->getCStride();
-        primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
+        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
     }
     else if (chromaId == 1)
     {
@@ -260,7 +235,7 @@
         pixel* dstV = dstPicYuv->getCrAddr(partIdx);
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->getCStride();
-        primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
+        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
     }
     else
     {
@@ -271,7 +246,7 @@
 
         uint32_t srcStride = m_cwidth;
         uint32_t dstStride = dstPicYuv->getCStride();
-        primitives.blockcpy_ps(width, height, dstU, dstStride, srcU, srcStride);
-        primitives.blockcpy_ps(width, height, dstV, dstStride, srcV, srcStride);
+        primitives.chroma[m_csp].copy_sp[part](dstU, dstStride, srcU, srcStride);
+        primitives.chroma[m_csp].copy_sp[part](dstV, dstStride, srcV, srcStride);
     }
 }
diff -r 50d7910ddd61 -r 4c8c208d66f2 source/common/shortyuv.h
--- a/source/common/shortyuv.h	Sun Mar 09 22:10:39 2014 -0500
+++ b/source/common/shortyuv.h	Mon Mar 10 16:59:22 2014 +0530
@@ -97,15 +97,13 @@
     void subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
     void addClip(ShortYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t trUnitIdx, uint32_t partSize);
 
-    void copyPartToPartYuv(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
     void copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId);
+    void copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame);
+    void copyPartToPartShortChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId);
 
-    void copyPartToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
     void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
-    void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, uint32_t chromaId);
+    void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame);
+    void copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId);
 };
 }
 


More information about the x265-devel mailing list