[x265] [PATCH] cudata: change cudata copy functions into template functions to avoid useless memory copy

Ximing Cheng chengximing1989 at foxmail.com
Tue Sep 6 19:22:44 CEST 2016


# HG changeset patch
# User Ximing Cheng <ximingcheng at tencent.com>
# Date 1473182485 -28800
#      Wed Sep 07 01:21:25 2016 +0800
# Node ID b24cf6bc3795f06d53cd9d614b38f021d0e55a2f
# Parent  df559450949bd085b0fc5e01332aa8458af2fa43
cudata: change cudata copy functions into template functions to avoid useless memory copy

diff -r df559450949b -r b24cf6bc3795 source/common/common.h
--- a/source/common/common.h	Wed Aug 10 13:26:18 2016 +0530
+++ b/source/common/common.h	Wed Sep 07 01:21:25 2016 +0800
@@ -246,6 +246,7 @@
 #define X265_LOG2(x)  log2(x)
 #endif
 
+#define MAX_SLICE_TYPES         3                           // maximum number of slice types
 #define NUM_CU_DEPTH            4                           // maximum number of CU depths
 #define NUM_FULL_DEPTH          5                           // maximum number of full depths
 #define MIN_LOG2_CU_SIZE        3                           // log2(minCUSize)
diff -r df559450949b -r b24cf6bc3795 source/common/cudata.cpp
--- a/source/common/cudata.cpp	Wed Aug 10 13:26:18 2016 +0530
+++ b/source/common/cudata.cpp	Wed Sep 07 01:21:25 2016 +0800
@@ -28,6 +28,9 @@
 #include "picyuv.h"
 #include "mv.h"
 #include "cudata.h"
+#if defined(_MSC_VER)
+#pragma warning(disable: 4127) // conditional expression is constant
+#endif
 
 using namespace X265_NS;
 
@@ -113,6 +116,19 @@
 cubcast_t CUData::s_partSet[NUM_FULL_DEPTH] = { NULL, NULL, NULL, NULL, NULL };
 uint32_t CUData::s_numPartInCUSize;
 
+#define CUDATA_SLICETYPE_FUNCS(funcName) /* defines the static table funcName_func holding one instantiation of funcName per slice type */ \
+    CUData::funcName ## _t CUData::funcName ## _func[MAX_SLICE_TYPES] = /* only identifier pieces are pasted; pasting onto a scope operator or angle bracket is ill-formed */ \
+    { &CUData::funcName<B_SLICE>, &CUData::funcName<P_SLICE>, &CUData::funcName<I_SLICE> };
+
+CUDATA_SLICETYPE_FUNCS(initCTU)
+CUDATA_SLICETYPE_FUNCS(initSubCU)
+CUDATA_SLICETYPE_FUNCS(initLosslessCU)
+CUDATA_SLICETYPE_FUNCS(copyPartFrom)
+CUDATA_SLICETYPE_FUNCS(copyToPic)
+CUDATA_SLICETYPE_FUNCS(copyFromPic)
+
+#undef CUDATA_SLICETYPE_FUNCS
+
 CUData::CUData()
 {
     memset(this, 0, sizeof(*this));
@@ -266,6 +282,7 @@
     }
 }
 
+template <SliceType type>
 void CUData::initCTU(const Frame& frame, uint32_t cuAddr, int qp)
 {
     m_encData       = frame.m_encData;
@@ -282,10 +299,11 @@
     m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)frame.m_encData->m_param->bLossless);
-    if (m_slice->m_sliceType != I_SLICE)
+    if (type != I_SLICE)
     {
         m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
-        m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
+        if (type == B_SLICE)
+            m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
     }
 
     X265_CHECK(!(frame.m_encData->m_param->bLossless && !m_slice->m_pps->bTransquantBypassEnabled), "lossless enabled without TQbypass in PPS\n");
@@ -301,6 +319,7 @@
 }
 
 // initialize Sub partition
+template <SliceType type>
 void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp)
 {
     m_absIdxInCTU   = cuGeom.absPartIdx;
@@ -321,8 +340,12 @@
     m_partSet(m_lumaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_chromaIntraDir, (uint8_t)ALL_IDX);
     m_partSet(m_tqBypass,     (uint8_t)m_encData->m_param->bLossless);
-    m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
-    m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
+    if (type != I_SLICE)
+    {
+        m_partSet((uint8_t*)m_refIdx[0], (uint8_t)REF_NOT_VALID);
+        if (type == B_SLICE)
+            m_partSet((uint8_t*)m_refIdx[1], (uint8_t)REF_NOT_VALID);
+    }
     m_partSet(m_cuDepth,      (uint8_t)cuGeom.depth);
 
     /* initialize the remaining CU data in one memset */
@@ -330,6 +353,7 @@
 }
 
 /* Copy the results of a sub-part (split) CU to the parent CU */
+template <SliceType type>
 void CUData::copyPartFrom(const CUData& subCU, const CUGeom& childGeom, uint32_t subPartIdx)
 {
     X265_CHECK(subPartIdx < 4, "part unit should be less than 4\n");
@@ -340,24 +364,37 @@
     m_subPartCopy(m_log2CUSize + offset, subCU.m_log2CUSize);
     m_subPartCopy(m_lumaIntraDir + offset, subCU.m_lumaIntraDir);
     m_subPartCopy(m_tqBypass + offset, subCU.m_tqBypass);
-    m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]);
-    m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]);
+    if (type != I_SLICE)
+    {
+        m_subPartCopy((uint8_t*)m_refIdx[0] + offset, (uint8_t*)subCU.m_refIdx[0]);
+        if (type == B_SLICE)
+            m_subPartCopy((uint8_t*)m_refIdx[1] + offset, (uint8_t*)subCU.m_refIdx[1]);
+    }
     m_subPartCopy(m_cuDepth + offset, subCU.m_cuDepth);
     m_subPartCopy(m_predMode + offset, subCU.m_predMode);
     m_subPartCopy(m_partSize + offset, subCU.m_partSize);
-    m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag);
-    m_subPartCopy(m_interDir + offset, subCU.m_interDir);
-    m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
-    m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
+    if (type != I_SLICE)
+    {
+        m_subPartCopy(m_mergeFlag + offset, subCU.m_mergeFlag);
+        m_subPartCopy(m_interDir + offset, subCU.m_interDir);
+        m_subPartCopy(m_mvpIdx[0] + offset, subCU.m_mvpIdx[0]);
+        if (type == B_SLICE)
+            m_subPartCopy(m_mvpIdx[1] + offset, subCU.m_mvpIdx[1]);
+    }
     m_subPartCopy(m_tuDepth + offset, subCU.m_tuDepth);
 
     m_subPartCopy(m_transformSkip[0] + offset, subCU.m_transformSkip[0]);
     m_subPartCopy(m_cbf[0] + offset, subCU.m_cbf[0]);
 
-    memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
-    memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
-    memcpy(m_mvd[0] + offset, subCU.m_mvd[0], childGeom.numPartitions * sizeof(MV));
-    memcpy(m_mvd[1] + offset, subCU.m_mvd[1], childGeom.numPartitions * sizeof(MV));
+    if (type != I_SLICE)
+    {
+        memcpy(m_mv[0] + offset, subCU.m_mv[0], childGeom.numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mv[1] + offset, subCU.m_mv[1], childGeom.numPartitions * sizeof(MV));
+        memcpy(m_mvd[0] + offset, subCU.m_mvd[0], childGeom.numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mvd[1] + offset, subCU.m_mvd[1], childGeom.numPartitions * sizeof(MV));
+    }
 
     uint32_t tmp = 1 << ((g_maxLog2CUSize - childGeom.depth) * 2);
     uint32_t tmp2 = subPartIdx * tmp;
@@ -389,6 +426,7 @@
 
 /* Copy all CU data from one instance to the next, except set lossless flag
  * This will only get used when --cu-lossless is enabled but --lossless is not. */
+template <SliceType type>
 void CUData::initLosslessCU(const CUData& cu, const CUGeom& cuGeom)
 {
     /* Start by making an exact copy */
@@ -404,10 +442,15 @@
     m_absIdxInCTU  = cuGeom.absPartIdx;
     m_numPartitions = cuGeom.numPartitions;
     memcpy(m_qp, cu.m_qp, BytesPerPartition * m_numPartitions);
-    memcpy(m_mv[0],  cu.m_mv[0],  m_numPartitions * sizeof(MV));
-    memcpy(m_mv[1],  cu.m_mv[1],  m_numPartitions * sizeof(MV));
-    memcpy(m_mvd[0], cu.m_mvd[0], m_numPartitions * sizeof(MV));
-    memcpy(m_mvd[1], cu.m_mvd[1], m_numPartitions * sizeof(MV));
+    if (type != I_SLICE)
+    {
+        memcpy(m_mv[0], cu.m_mv[0], m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mv[1], cu.m_mv[1], m_numPartitions * sizeof(MV));
+        memcpy(m_mvd[0], cu.m_mvd[0], m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mvd[1], cu.m_mvd[1], m_numPartitions * sizeof(MV));
+    }
 
     /* force TQBypass to true */
     m_partSet(m_tqBypass, true);
@@ -429,6 +472,7 @@
 }
 
 /* Copy completed predicted CU to CTU in picture */
+template <SliceType type>
 void CUData::copyToPic(uint32_t depth) const
 {
     CUData& ctu = *m_encData->getPicCTU(m_cuAddr);
@@ -437,23 +481,36 @@
     m_partCopy(ctu.m_log2CUSize + m_absIdxInCTU, m_log2CUSize);
     m_partCopy(ctu.m_lumaIntraDir + m_absIdxInCTU, m_lumaIntraDir);
     m_partCopy(ctu.m_tqBypass + m_absIdxInCTU, m_tqBypass);
-    m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]);
-    m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]);
+    if (type != I_SLICE)
+    {
+        m_partCopy((uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU, (uint8_t*)m_refIdx[0]);
+        if (type == B_SLICE)
+            m_partCopy((uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU, (uint8_t*)m_refIdx[1]);
+    }
     m_partCopy(ctu.m_cuDepth + m_absIdxInCTU, m_cuDepth);
     m_partCopy(ctu.m_predMode + m_absIdxInCTU, m_predMode);
     m_partCopy(ctu.m_partSize + m_absIdxInCTU, m_partSize);
-    m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag);
-    m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir);
-    m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]);
-    m_partCopy(ctu.m_mvpIdx[1] + m_absIdxInCTU, m_mvpIdx[1]);
+    if (type != I_SLICE)
+    {
+        m_partCopy(ctu.m_mergeFlag + m_absIdxInCTU, m_mergeFlag);
+        m_partCopy(ctu.m_interDir + m_absIdxInCTU, m_interDir);
+        m_partCopy(ctu.m_mvpIdx[0] + m_absIdxInCTU, m_mvpIdx[0]);
+        if (type == B_SLICE)
+            m_partCopy(ctu.m_mvpIdx[1] + m_absIdxInCTU, m_mvpIdx[1]);
+    }
     m_partCopy(ctu.m_tuDepth + m_absIdxInCTU, m_tuDepth);
     m_partCopy(ctu.m_transformSkip[0] + m_absIdxInCTU, m_transformSkip[0]);
     m_partCopy(ctu.m_cbf[0] + m_absIdxInCTU, m_cbf[0]);
 
-    memcpy(ctu.m_mv[0] + m_absIdxInCTU, m_mv[0], m_numPartitions * sizeof(MV));
-    memcpy(ctu.m_mv[1] + m_absIdxInCTU, m_mv[1], m_numPartitions * sizeof(MV));
-    memcpy(ctu.m_mvd[0] + m_absIdxInCTU, m_mvd[0], m_numPartitions * sizeof(MV));
-    memcpy(ctu.m_mvd[1] + m_absIdxInCTU, m_mvd[1], m_numPartitions * sizeof(MV));
+    if (type != I_SLICE)
+    {
+        memcpy(ctu.m_mv[0] + m_absIdxInCTU, m_mv[0], m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(ctu.m_mv[1] + m_absIdxInCTU, m_mv[1], m_numPartitions * sizeof(MV));
+        memcpy(ctu.m_mvd[0] + m_absIdxInCTU, m_mvd[0], m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(ctu.m_mvd[1] + m_absIdxInCTU, m_mvd[1], m_numPartitions * sizeof(MV));
+    }
 
     uint32_t tmpY = 1 << ((g_maxLog2CUSize - depth) * 2);
     uint32_t tmpY2 = m_absIdxInCTU << (LOG2_UNIT_SIZE * 2);
@@ -475,6 +532,7 @@
 }
 
 /* The reverse of copyToPic, called only by encodeResidue */
+template <SliceType type>
 void CUData::copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp)
 {
     m_encData       = ctu.m_encData;
@@ -491,21 +549,34 @@
     m_partCopy(m_log2CUSize,   ctu.m_log2CUSize + m_absIdxInCTU);
     m_partCopy(m_lumaIntraDir, ctu.m_lumaIntraDir + m_absIdxInCTU);
     m_partCopy(m_tqBypass,     ctu.m_tqBypass + m_absIdxInCTU);
-    m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU);
-    m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU);
+    if (type != I_SLICE)
+    {
+        m_partCopy((uint8_t*)m_refIdx[0], (uint8_t*)ctu.m_refIdx[0] + m_absIdxInCTU);
+        if (type == B_SLICE)
+            m_partCopy((uint8_t*)m_refIdx[1], (uint8_t*)ctu.m_refIdx[1] + m_absIdxInCTU);
+    }
     m_partCopy(m_cuDepth,      ctu.m_cuDepth + m_absIdxInCTU);
     m_partSet(m_predMode, ctu.m_predMode[m_absIdxInCTU] & (MODE_INTRA | MODE_INTER)); /* clear skip flag */
     m_partCopy(m_partSize,     ctu.m_partSize + m_absIdxInCTU);
-    m_partCopy(m_mergeFlag,    ctu.m_mergeFlag + m_absIdxInCTU);
-    m_partCopy(m_interDir,     ctu.m_interDir + m_absIdxInCTU);
-    m_partCopy(m_mvpIdx[0],    ctu.m_mvpIdx[0] + m_absIdxInCTU);
-    m_partCopy(m_mvpIdx[1],    ctu.m_mvpIdx[1] + m_absIdxInCTU);
+    if (type != I_SLICE)
+    {
+        m_partCopy(m_mergeFlag, ctu.m_mergeFlag + m_absIdxInCTU);
+        m_partCopy(m_interDir, ctu.m_interDir + m_absIdxInCTU);
+        m_partCopy(m_mvpIdx[0], ctu.m_mvpIdx[0] + m_absIdxInCTU);
+        if (type == B_SLICE)
+            m_partCopy(m_mvpIdx[1], ctu.m_mvpIdx[1] + m_absIdxInCTU);
+    }
     m_partCopy(m_chromaIntraDir, ctu.m_chromaIntraDir + m_absIdxInCTU);
 
-    memcpy(m_mv[0], ctu.m_mv[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
-    memcpy(m_mv[1], ctu.m_mv[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
-    memcpy(m_mvd[0], ctu.m_mvd[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
-    memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
+    if (type != I_SLICE)
+    {
+        memcpy(m_mv[0], ctu.m_mv[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mv[1], ctu.m_mv[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
+        memcpy(m_mvd[0], ctu.m_mvd[0] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
+        if (type == B_SLICE)
+            memcpy(m_mvd[1], ctu.m_mvd[1] + m_absIdxInCTU, m_numPartitions * sizeof(MV));
+    }
 
     /* clear residual coding flags */
     m_partSet(m_tuDepth, 0);
diff -r df559450949b -r b24cf6bc3795 source/common/cudata.h
--- a/source/common/cudata.h	Wed Aug 10 13:26:18 2016 +0530
+++ b/source/common/cudata.h	Wed Sep 07 01:21:25 2016 +0800
@@ -154,6 +154,9 @@
     { 0x00, 0x05, 0x05, 0x05 }  // SIZE_nRx2N.
 };
 
+#define CUDATA_FUNC(caller, funcName, type, ...) /* calls the funcName_func[type] member function on the CUData that 'caller' points to */ \
+    ((caller)->*CUData::funcName ## _func[(type)])(__VA_ARGS__)
+
 // Holds part data for a CU of a given size, from an 8x8 CU to a CTU
 class CUData
 {
@@ -214,18 +217,33 @@
     void     initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance);
     static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]);
 
-    void     initCTU(const Frame& frame, uint32_t cuAddr, int qp);
-    void     initSubCU(const CUData& ctu, const CUGeom& cuGeom, int qp);
-    void     initLosslessCU(const CUData& cu, const CUGeom& cuGeom);
+#define DECLARE_SLICETYPE_TEMPLATE_FUNC(funcName, ...) /* declares funcName<type>, the pointer type funcName_t and the table funcName_func; end each use with ';' */ \
+    typedef void (CUData::*funcName ## _t)(__VA_ARGS__); \
+    static funcName ## _t funcName ## _func[MAX_SLICE_TYPES]; \
+    template <SliceType type> \
+    void funcName(__VA_ARGS__)
 
-    void     copyPartFrom(const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
+#define DECLARE_SLICETYPE_TEMPLATE_CONST_FUNC(funcName, ...) /* const-qualified counterpart: declares funcName<type> const, funcName_t and funcName_func; end each use with ';' */ \
+    typedef void (CUData::*funcName ## _t)(__VA_ARGS__) const; \
+    static funcName ## _t funcName ## _func[MAX_SLICE_TYPES]; \
+    template <SliceType type> \
+    void funcName(__VA_ARGS__) const
+
+    DECLARE_SLICETYPE_TEMPLATE_FUNC(initCTU, const Frame& frame, uint32_t cuAddr, int qp);
+    DECLARE_SLICETYPE_TEMPLATE_FUNC(initSubCU, const CUData& ctu, const CUGeom& cuGeom, int qp);
+    DECLARE_SLICETYPE_TEMPLATE_FUNC(initLosslessCU, const CUData& cu, const CUGeom& cuGeom);
+
+    DECLARE_SLICETYPE_TEMPLATE_FUNC(copyPartFrom, const CUData& cu, const CUGeom& childGeom, uint32_t subPartIdx);
     void     setEmptyPart(const CUGeom& childGeom, uint32_t subPartIdx);
-    void     copyToPic(uint32_t depth) const;
+    DECLARE_SLICETYPE_TEMPLATE_CONST_FUNC(copyToPic, uint32_t depth);
 
     /* RD-0 methods called only from encodeResidue */
-    void     copyFromPic(const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp = true);
+    DECLARE_SLICETYPE_TEMPLATE_FUNC(copyFromPic, const CUData& ctu, const CUGeom& cuGeom, int csp, bool copyQp);
     void     updatePic(uint32_t depth, int picCsp) const;
 
+#undef DECLARE_SLICETYPE_TEMPLATE_CONST_FUNC
+#undef DECLARE_SLICETYPE_TEMPLATE_FUNC
+
     void     setPartSizeSubParts(PartSize size)    { m_partSet(m_partSize, (uint8_t)size); }
     void     setPredModeSubParts(PredMode mode)    { m_partSet(m_predMode, (uint8_t)mode); }
     void     clearCbf()                            { m_partSet(m_cbf[0], 0); if (m_chromaFormat != X265_CSP_I400) { m_partSet(m_cbf[1], 0); m_partSet(m_cbf[2], 0);} }
diff -r df559450949b -r b24cf6bc3795 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp	Wed Aug 10 13:26:18 2016 +0530
+++ b/source/encoder/analysis.cpp	Wed Sep 07 01:21:25 2016 +0800
@@ -70,6 +70,9 @@
  * rd-level 5,6 does RDO for each inter mode
  */
 
+#define CALL_CUDATA_FUNC(caller, funcName, ...) /* forwards to CUDATA_FUNC using m_slice->m_sliceType as the table index; m_slice must be in scope at the call site */ \
+    CUDATA_FUNC(caller, funcName, m_slice->m_sliceType, __VA_ARGS__)
+
 Analysis::Analysis()
 {
     m_reuseInterDataCTU = NULL;
@@ -213,7 +216,7 @@
     else if (md.bestMode->cu.isIntra(0))
     {
         md.pred[PRED_LOSSLESS].initCosts();
-        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
+        CALL_CUDATA_FUNC(&md.pred[PRED_LOSSLESS].cu, initLosslessCU, md.bestMode->cu, cuGeom);
         PartSize size = (PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0];
         checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
@@ -221,7 +224,7 @@
     else
     {
         md.pred[PRED_LOSSLESS].initCosts();
-        md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom);
+        CALL_CUDATA_FUNC(&md.pred[PRED_LOSSLESS].cu, initLosslessCU, md.bestMode->cu, cuGeom);
         md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv);
         encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom);
         checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth);
@@ -279,7 +282,7 @@
     recodeCU(parentCTU, cuGeom, bestCUQP, lambdaQP);
 
     /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
+    CALL_CUDATA_FUNC(&md.bestMode->cu, copyToPic, depth);
     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 }
 
@@ -301,7 +304,7 @@
         {
             Mode& mode = md.pred[0];
             md.bestMode = &mode;
-            mode.cu.initSubCU(parentCTU, cuGeom, qp);
+            mode.cu.initSubCU<I_SLICE>(parentCTU, cuGeom, qp);
             memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
             memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
             checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
@@ -315,13 +318,13 @@
     }
     else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
     {
-        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+        md.pred[PRED_INTRA].cu.initSubCU<I_SLICE>(parentCTU, cuGeom, qp);
         checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
         checkBestMode(md.pred[PRED_INTRA], depth);
 
         if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
         {
-            md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+            md.pred[PRED_INTRA_NxN].cu.initSubCU<I_SLICE>(parentCTU, cuGeom, qp);
             checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
             checkBestMode(md.pred[PRED_INTRA_NxN], depth);
         }
@@ -341,7 +344,7 @@
         Mode* splitPred = &md.pred[PRED_SPLIT];
         splitPred->initCosts();
         CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
+        splitCU->initSubCU<I_SLICE>(parentCTU, cuGeom, qp);
 
         uint32_t nextDepth = depth + 1;
         ModeDepth& nd = m_modeDepth[nextDepth];
@@ -363,7 +366,7 @@
                 compressIntraCU(parentCTU, childGeom, nextQP);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                splitCU->copyPartFrom<I_SLICE>(nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
@@ -395,7 +398,7 @@
     }
 
     /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
+    md.bestMode->cu.copyToPic<I_SLICE>(depth);
     if (md.bestMode != &md.pred[PRED_SPLIT])
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 }
@@ -610,8 +613,8 @@
     if (mightNotSplit && depth >= minDepth)
     {
         /* Initialize all prediction CUs based on parentCTU */
-        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_MERGE].cu, initSubCU, parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_SKIP].cu, initSubCU, parentCTU, cuGeom, qp);
 
         if (m_param->rdLevel <= 4)
             checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
@@ -633,7 +636,7 @@
         Mode* splitPred = &md.pred[PRED_SPLIT];
         splitPred->initCosts();
         CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(splitCU, initSubCU, parentCTU, cuGeom, qp);
 
         uint32_t nextDepth = depth + 1;
         ModeDepth& nd = m_modeDepth[nextDepth];
@@ -657,7 +660,7 @@
 
                 // Save best CU and pred data for this sub CU
                 splitIntra |= nd.bestMode->cu.isIntra(0);
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                CALL_CUDATA_FUNC(splitCU, copyPartFrom, nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
 
                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
@@ -686,24 +689,24 @@
 
         if (bTryIntra)
         {
-            md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+            CALL_CUDATA_FUNC(&md.pred[PRED_INTRA].cu, initSubCU, parentCTU, cuGeom, qp);
             if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5)
-                md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_INTRA_NxN].cu, initSubCU, parentCTU, cuGeom, qp);
             pmode.modes[pmode.m_jobTotal++] = PRED_INTRA;
         }
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
-        md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_2Nx2N].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N;
+        CALL_CUDATA_FUNC(&md.pred[PRED_BIDIR].cu, initSubCU, parentCTU, cuGeom, qp);
         if (m_param->bEnableRectInter)
         {
-            md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
-            md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
+            CALL_CUDATA_FUNC(&md.pred[PRED_2NxN].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN;
+            CALL_CUDATA_FUNC(&md.pred[PRED_Nx2N].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N;
         }
         if (bTryAmp)
         {
-            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
-            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
-            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
-            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
+            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnU].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU;
+            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnD].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD;
+            CALL_CUDATA_FUNC(&md.pred[PRED_nLx2N].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N;
+            CALL_CUDATA_FUNC(&md.pred[PRED_nRx2N].cu, initSubCU, parentCTU, cuGeom, qp); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N;
         }
 
         m_splitRefIdx[0] = splitRefs[0]; m_splitRefIdx[1] = splitRefs[1]; m_splitRefIdx[2] = splitRefs[2]; m_splitRefIdx[3] = splitRefs[3];
@@ -870,7 +873,7 @@
     }
 
     /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
+    CALL_CUDATA_FUNC(&md.bestMode->cu, copyToPic, depth);
     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx);
 
     return refMask;
@@ -914,8 +917,8 @@
         {
             if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
             {
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_MERGE].cu, initSubCU, parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_SKIP].cu, initSubCU, parentCTU, cuGeom, qp);
                 checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
 
                 skipRecursion = !!m_param->bEnableRecursionSkip && md.bestMode;
@@ -937,8 +940,8 @@
     if (mightNotSplit && depth >= minDepth && !md.bestMode) /* TODO: Re-evaluate if analysis load/save still works */
     {
         /* Compute Merge Cost */
-        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
-        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_MERGE].cu, initSubCU, parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_SKIP].cu, initSubCU, parentCTU, cuGeom, qp);
         checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
         if (m_param->rdLevel)
             skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth
@@ -962,7 +965,7 @@
         Mode* splitPred = &md.pred[PRED_SPLIT];
         splitPred->initCosts();
         CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(splitCU, initSubCU, parentCTU, cuGeom, qp);
 
         uint32_t nextDepth = depth + 1;
         ModeDepth& nd = m_modeDepth[nextDepth];
@@ -986,7 +989,7 @@
 
                 // Save best CU and pred data for this sub CU
                 splitIntra |= nd.bestMode->cu.isIntra(0);
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                CALL_CUDATA_FUNC(splitCU, copyPartFrom, nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
 
                 if (m_param->rdLevel)
@@ -1023,7 +1026,7 @@
         {
             uint32_t refMasks[2];
             refMasks[0] = allSplitRefs;
-            md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+            CALL_CUDATA_FUNC(&md.pred[PRED_2Nx2N].cu, initSubCU, parentCTU, cuGeom, qp);
             checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
 
             if (m_param->limitReferences & X265_REF_LIMIT_CU)
@@ -1035,7 +1038,7 @@
 
             if (m_slice->m_sliceType == B_SLICE)
             {
-                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
+                md.pred[PRED_BIDIR].cu.initSubCU<B_SLICE>(parentCTU, cuGeom, qp);
                 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
             }
 
@@ -1065,7 +1068,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                         refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_2NxN].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                         if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                             bestInter = &md.pred[PRED_2NxN];
@@ -1075,7 +1078,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
                         refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
-                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_Nx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                         if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)
                             bestInter = &md.pred[PRED_Nx2N];
@@ -1085,7 +1088,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                         refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_2NxN].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                         if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)
                             bestInter = &md.pred[PRED_2NxN];
@@ -1137,7 +1140,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% top */
                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnD].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                             if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_2NxnD];
@@ -1147,7 +1150,7 @@
                         {
                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
                             refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnU].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                             if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_2NxnU];
@@ -1157,7 +1160,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% top */
                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnD].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                             if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_2NxnD];
@@ -1170,7 +1173,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% left  */
                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nRx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                             if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_nRx2N];
@@ -1180,7 +1183,7 @@
                         {
                             refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
                             refMasks[1] = allSplitRefs;                                    /* 75% right */
-                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nLx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                             if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_nLx2N];
@@ -1190,7 +1193,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% left  */
                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nRx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                             if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)
                                 bestInter = &md.pred[PRED_nRx2N];
@@ -1239,7 +1242,7 @@
                     if (!m_param->limitReferences || splitIntra)
                     {
                         ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
-                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_INTRA].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                         encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);
                         checkBestMode(md.pred[PRED_INTRA], depth);
@@ -1265,7 +1268,7 @@
                     if (!m_param->limitReferences || splitIntra)
                     {
                         ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
-                        md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_INTRA].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkIntraInInter(md.pred[PRED_INTRA], cuGeom);
                         if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)
                             md.bestMode = &md.pred[PRED_INTRA];
@@ -1396,7 +1399,7 @@
     }
 
     /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
+    CALL_CUDATA_FUNC(&md.bestMode->cu, copyToPic, depth);
     if (m_param->rdLevel)
         md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);
 
@@ -1437,12 +1440,12 @@
         {
             if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP)
             {
-                md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-                md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_SKIP].cu, initSubCU, parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_MERGE].cu, initSubCU, parentCTU, cuGeom, qp);
                 checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
                 skipModes = !!m_param->bEnableEarlySkip && md.bestMode;
                 refMasks[0] = allSplitRefs;
-                md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                CALL_CUDATA_FUNC(&md.pred[PRED_2Nx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                 checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
                 checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
@@ -1457,12 +1460,12 @@
     /* Step 1. Evaluate Merge/Skip candidates for likely early-outs */
     if (mightNotSplit && !md.bestMode)
     {
-        md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);
-        md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_SKIP].cu, initSubCU, parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_MERGE].cu, initSubCU, parentCTU, cuGeom, qp);
         checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);
         skipModes = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0);
         refMasks[0] = allSplitRefs;
-        md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&md.pred[PRED_2Nx2N].cu, initSubCU, parentCTU, cuGeom, qp);
         checkInter_rd5_6(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);
         checkBestMode(md.pred[PRED_2Nx2N], cuGeom.depth);
 
@@ -1477,7 +1480,7 @@
         Mode* splitPred = &md.pred[PRED_SPLIT];
         splitPred->initCosts();
         CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(splitCU, initSubCU, parentCTU, cuGeom, qp);
 
         uint32_t nextDepth = depth + 1;
         ModeDepth& nd = m_modeDepth[nextDepth];
@@ -1501,7 +1504,7 @@
 
                 // Save best CU and pred data for this sub CU
                 splitIntra |= nd.bestMode->cu.isIntra(0);
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                CALL_CUDATA_FUNC(splitCU, copyPartFrom, nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
@@ -1543,7 +1546,7 @@
 
             if (m_slice->m_sliceType == B_SLICE)
             {
-                md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);
+                md.pred[PRED_BIDIR].cu.initSubCU<B_SLICE>(parentCTU, cuGeom, qp);
                 checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);
                 if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64)
                 {
@@ -1584,7 +1587,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                         refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_2NxN].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                         checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                     }
@@ -1593,7 +1596,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */
                         refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */
-                        md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_Nx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);
                         checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth);
                     }
@@ -1602,7 +1605,7 @@
                     {
                         refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */
                         refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */
-                        md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_2NxN].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);
                         checkBestMode(md.pred[PRED_2NxN], cuGeom.depth);
                     }
@@ -1653,7 +1656,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% top */
                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnD].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                             checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                         }
@@ -1662,7 +1665,7 @@
                         {
                             refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */
                             refMasks[1] = allSplitRefs;                                    /* 75% bot */
-                            md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnU].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);
                             checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth);
                         }
@@ -1671,7 +1674,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% top */
                             refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */
-                            md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_2NxnD].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);
                             checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth);
                         }
@@ -1684,7 +1687,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% left  */
                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nRx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                             checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                         }
@@ -1693,7 +1696,7 @@
                         {
                             refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left  */
                             refMasks[1] = allSplitRefs;                                    /* 75% right */
-                            md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nLx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);
                             checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth);
                         }
@@ -1702,7 +1705,7 @@
                         {
                             refMasks[0] = allSplitRefs;                                    /* 75% left  */
                             refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */
-                            md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);
+                            CALL_CUDATA_FUNC(&md.pred[PRED_nRx2N].cu, initSubCU, parentCTU, cuGeom, qp);
                             checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);
                             checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth);
                         }
@@ -1715,13 +1718,13 @@
                 if (!m_param->limitReferences || splitIntra)
                 {
                     ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);
-                    md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
+                    CALL_CUDATA_FUNC(&md.pred[PRED_INTRA].cu, initSubCU, parentCTU, cuGeom, qp);
                     checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
                     checkBestMode(md.pred[PRED_INTRA], depth);
 
                     if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
                     {
-                        md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
+                        CALL_CUDATA_FUNC(&md.pred[PRED_INTRA_NxN].cu, initSubCU, parentCTU, cuGeom, qp);
                         checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
                         checkBestMode(md.pred[PRED_INTRA_NxN], depth);
                     }
@@ -1786,7 +1789,7 @@
     }
 
     /* Copy best data to encData CTU and recon */
-    md.bestMode->cu.copyToPic(depth);
+    CALL_CUDATA_FUNC(&md.bestMode->cu, copyToPic, depth);
     md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
 
     return splitCUData;
@@ -1808,7 +1811,7 @@
 
         Mode& mode = md.pred[0];
         md.bestMode = &mode;
-        mode.cu.initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(&mode.cu, initSubCU, parentCTU, cuGeom, qp);
         PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx];
         if (parentCTU.isIntra(cuGeom.absPartIdx))
         {
@@ -1818,7 +1821,7 @@
         }
         else
         {
-            mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false);
+            CALL_CUDATA_FUNC(&mode.cu, copyFromPic, parentCTU, cuGeom, m_csp, false);
             for (int part = 0; part < (int)parentCTU.getNumPartInter(cuGeom.absPartIdx); part++)
             {
                 PredictionUnit pu(mode.cu, cuGeom, part);
@@ -1848,7 +1851,7 @@
         md.bestMode = splitPred;
         splitPred->initCosts();
         CUData* splitCU = &splitPred->cu;
-        splitCU->initSubCU(parentCTU, cuGeom, qp);
+        CALL_CUDATA_FUNC(splitCU, initSubCU, parentCTU, cuGeom, qp);
 
         uint32_t nextDepth = depth + 1;
         ModeDepth& nd = m_modeDepth[nextDepth];
@@ -1870,7 +1873,7 @@
                 qprdRefine(parentCTU, childGeom, nextQP, lqp);
 
                 // Save best CU and pred data for this sub CU
-                splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
+                CALL_CUDATA_FUNC(splitCU, copyPartFrom, nd.bestMode->cu, childGeom, subPartIdx);
                 splitPred->addSubCosts(*nd.bestMode);
                 nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
                 nextContext = &nd.bestMode->contexts;
@@ -1891,7 +1894,7 @@
         checkDQPForSplitPred(*splitPred, cuGeom);
 
         /* Copy best data to encData CTU and recon */
-        md.bestMode->cu.copyToPic(depth);
+        CALL_CUDATA_FUNC(&md.bestMode->cu, copyToPic, depth);
         md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
     }
 }
@@ -2384,7 +2387,7 @@
     Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
     CUData& cu = bestMode->cu;
 
-    cu.copyFromPic(ctu, cuGeom, m_csp);
+    CALL_CUDATA_FUNC(&cu, copyFromPic, ctu, cuGeom, m_csp, true);
 
     PicYuv& reconPic = *m_frame->m_reconPic;
 
diff -r df559450949b -r b24cf6bc3795 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp	Wed Aug 10 13:26:18 2016 +0530
+++ b/source/encoder/frameencoder.cpp	Wed Sep 07 01:21:25 2016 +0800
@@ -953,7 +953,7 @@
         const uint32_t col = curRow.completed;
         const uint32_t cuAddr = lineStartCUAddr + col;
         CUData* ctu = curEncData.getPicCTU(cuAddr);
-        ctu->initCTU(*m_frame, cuAddr, slice->m_sliceQp);
+        CUDATA_FUNC(ctu, initCTU, slice->m_sliceType, *m_frame, cuAddr, slice->m_sliceQp);
 
         if (bIsVbv)
         {





More information about the x265-devel mailing list