<div dir="ltr">Thanks Satoshi. Pushed for testing.<br></div><div class="gmail_extra"><br><br><div class="gmail_quote">On Mon, Jun 2, 2014 at 8:17 AM, Satoshi Nakagawa <span dir="ltr"><<a href="mailto:nakagawa424@oki.com" target="_blank">nakagawa424@oki.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Satoshi Nakagawa <<a href="mailto:nakagawa424@oki.com">nakagawa424@oki.com</a>><br>
# Date 1401677099 -32400<br>
#      Mon Jun 02 11:44:59 2014 +0900<br>
# Node ID 73f86312c2e0aa5a105e84b0045478e02c8a03e7<br>
# Parent  a5998df9b12ef81e48e7c5b89219a74276a75f27<br>
refine cbf==0 path: remove clearing coeff and resi<br>
<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.cpp<br>
--- a/source/Lib/TLibEncoder/TEncEntropy.cpp    Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/Lib/TLibEncoder/TEncEntropy.cpp    Mon Jun 02 11:44:59 2014 +0900<br>
@@ -202,7 +202,6 @@<br>
<br>
 void TEncEntropy::initTUEntropySection(TComTURecurse *tuIterator, uint32_t splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)<br>
 {<br>
-    tuIterator->m_partOffset        = 0;<br>
     tuIterator->m_section           = 0;<br>
     tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;<br>
     tuIterator->m_splitMode         = splitMode;<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncEntropy.h<br>
--- a/source/Lib/TLibEncoder/TEncEntropy.h      Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/Lib/TLibEncoder/TEncEntropy.h      Mon Jun 02 11:44:59 2014 +0900<br>
@@ -66,7 +66,6 @@<br>
     uint32_t          m_splitMode;<br>
     uint32_t          m_absPartIdxTURelCU;<br>
     uint32_t          m_absPartIdxStep;<br>
-    uint32_t          m_partOffset;<br>
 };<br>
<br>
 // ====================================================================================================================<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSbac.cpp<br>
--- a/source/Lib/TLibEncoder/TEncSbac.cpp       Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/Lib/TLibEncoder/TEncSbac.cpp       Mon Jun 02 11:44:59 2014 +0900<br>
@@ -2120,8 +2120,9 @@<br>
     // compute number of significant coefficients<br>
     uint32_t numSig = primitives.count_nonzero(coeff, trSize * trSize);<br>
<br>
-    if (numSig == 0)<br>
-        return;<br>
+#if CHECKED_BUILD || _DEBUG<br>
+    X265_CHECK(numSig > 0, "cbf check fail");<br>
+#endif<br>
<br>
     bool beValid;<br>
     if (cu->getCUTransquantBypass(absPartIdx))<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/Lib/TLibEncoder/TEncSearch.cpp<br>
--- a/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp     Mon Jun 02 11:44:59 2014 +0900<br>
@@ -408,8 +408,8 @@<br>
     coeff_t* coeff          = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;<br>
<br>
     int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);<br>
-<br>
     X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");<br>
+    const uint32_t reconQtStride = MAX_CU_SIZE;<br>
<br>
     uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;<br>
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);<br>
@@ -443,25 +443,29 @@<br>
     //--- set coded block flag ---<br>
     cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);<br>
<br>
-    //--- inverse transform ---<br>
     if (absSum)<br>
     {<br>
+        //--- inverse transform ---<br>
         int scalingListType = 0 + TEXT_LUMA;<br>
-        X265_CHECK(scalingListType < 6, "scalingListType is too large %d\n", scalingListType);<br>
+        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);<br>
         m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);<br>
+        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);<br>
+        //===== reconstruction =====<br>
+        primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);<br>
+        //===== update distortion =====<br>
+        outDist += primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);<br>
     }<br>
     else<br>
     {<br>
-        int16_t* resiTmp = residual;<br>
+#if CHECKED_BUILD || _DEBUG<br>
         memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);<br>
-        primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);<br>
+#endif<br>
+        //===== reconstruction =====<br>
+        primitives.luma_copy_ps[part](reconQt,    reconQtStride,    pred, stride);<br>
+        primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);<br>
+        //===== update distortion =====<br>
+        outDist += primitives.sse_pp[part](pred, stride, fenc, stride);<br>
     }<br>
-<br>
-    X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);<br>
-    //===== reconstruction =====<br>
-    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);<br>
-    //===== update distortion =====<br>
-    outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);<br>
 }<br>
<br>
 void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,<br>
@@ -519,67 +523,67 @@<br>
     primitives.calcresidual[sizeIdx](fenc, pred, residual, stride);<br>
<br>
     //===== transform and quantization =====<br>
+    //--- init rate estimation arrays for RDOQ ---<br>
+    if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS : m_cfg->bEnableRDOQ)<br>
     {<br>
-        //--- init rate estimation arrays for RDOQ ---<br>
-        if (useTransformSkipChroma ? m_cfg->bEnableRDOQTS : m_cfg->bEnableRDOQ)<br>
-        {<br>
-            m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize, ttype);<br>
-        }<br>
-        //--- transform and quantization ---<br>
-        uint32_t absSum = 0;<br>
-        int lastPos = -1;<br>
-<br>
-        int curChromaQpOffset;<br>
-        if (ttype == TEXT_CHROMA_U)<br>
-        {<br>
-            curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();<br>
-        }<br>
-        else<br>
-        {<br>
-            curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();<br>
-        }<br>
-        m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
-<br>
-        m_trQuant->selectLambda(TEXT_CHROMA);<br>
-<br>
-        absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);<br>
-<br>
-        //--- set coded block flag ---<br>
-        cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdx, absPartIdxStep);<br>
-<br>
+        m_entropyCoder->estimateBit(m_trQuant->m_estBitsSbac, tuSize, ttype);<br>
+    }<br>
+<br>
+    //--- transform and quantization ---<br>
+    uint32_t absSum = 0;<br>
+    int lastPos = -1;<br>
+<br>
+    int curChromaQpOffset;<br>
+    if (ttype == TEXT_CHROMA_U)<br>
+    {<br>
+        curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();<br>
+    }<br>
+    else<br>
+    {<br>
+        curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();<br>
+    }<br>
+    m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
+    m_trQuant->selectLambda(TEXT_CHROMA);<br>
+<br>
+    absSum = m_trQuant->transformNxN(cu, residual, stride, coeff, tuSize, ttype, absPartIdx, &lastPos, useTransformSkipChroma);<br>
+<br>
+    //--- set coded block flag ---<br>
+    cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdx, absPartIdxStep);<br>
+<br>
+    uint32_t dist;<br>
+    if (absSum)<br>
+    {<br>
         //--- inverse transform ---<br>
-        if (absSum)<br>
-        {<br>
-            int scalingListType = 0 + ttype;<br>
-            X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);<br>
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);<br>
-        }<br>
-        else<br>
-        {<br>
-            int16_t* resiTmp = residual;<br>
-            memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);<br>
-            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);<br>
-        }<br>
+        int scalingListType = 0 + ttype;<br>
+        X265_CHECK(scalingListType < 6, "scalingListType invalid %d\n", scalingListType);<br>
+        m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);<br>
+        X265_CHECK(tuSize <= 32, "tuSize is too large %d\n", tuSize);<br>
+        //===== reconstruction =====<br>
+        primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);<br>
+        //===== update distortion =====<br>
+        dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);<br>
     }<br>
-<br>
-    X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failure\n");<br>
-    X265_CHECK(tuSize <= 32, "tuSize invalud\n");<br>
-    //===== reconstruction =====<br>
-    primitives.calcrecon[sizeIdx](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);<br>
-    //===== update distortion =====<br>
-    uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);<br>
+    else<br>
+    {<br>
+#if CHECKED_BUILD || _DEBUG<br>
+        memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);<br>
+#endif<br>
+        //===== reconstruction =====<br>
+        primitives.square_copy_ps[sizeIdx](reconQt,    reconQtStride,    pred, stride);<br>
+        primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);<br>
+        //===== update distortion =====<br>
+        dist = primitives.sse_pp[part](pred, stride, fenc, stride);<br>
+    }<br>
+<br>
+    X265_CHECK(ttype == TEXT_CHROMA_U || ttype == TEXT_CHROMA_V, "invalid ttype\n");<br>
     if (ttype == TEXT_CHROMA_U)<br>
     {<br>
         outDist += m_rdCost->scaleChromaDistCb(dist);<br>
     }<br>
-    else if (ttype == TEXT_CHROMA_V)<br>
+    else<br>
     {<br>
         outDist += m_rdCost->scaleChromaDistCr(dist);<br>
     }<br>
-    else<br>
-    {<br>
-        outDist += dist;<br>
-    }<br>
 }<br>
<br>
 void TEncSearch::xRecurIntraCodingQT(TComDataCU* cu,<br>
@@ -784,15 +788,15 @@<br>
         cu->setTransformSkipSubParts(bestModeId, TEXT_LUMA, absPartIdx, fullDepth);<br>
<br>
         //--- set reconstruction for next intra prediction blocks ---<br>
-        uint32_t width     = cu->getCUSize(0) >> trDepth;<br>
-        uint32_t height    = cu->getCUSize(0) >> trDepth;<br>
         uint32_t qtLayer   = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;<br>
         uint32_t zorder    = cu->getZorderIdxInCU() + absPartIdx;<br>
         int16_t* src       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);<br>
         X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");<br>
+        const uint32_t srcstride = MAX_CU_SIZE;<br>
         pixel*   dst       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);<br>
         uint32_t dststride = cu->getPic()->getPicYuvRec()->getStride();<br>
-        primitives.blockcpy_ps(width, height, dst, dststride, src, MAX_CU_SIZE);<br>
+        int sizeIdx = trSizeLog2 - 2;<br>
+        primitives.square_copy_sp[sizeIdx](dst, dststride, src, srcstride);<br>
     }<br>
<br>
     outDistY += singleDistY;<br>
@@ -866,25 +870,29 @@<br>
         //--- set coded block flag ---<br>
         cu->setCbfSubParts((absSum ? 1 : 0) << trDepth, TEXT_LUMA, absPartIdx, fullDepth);<br>
<br>
-        //--- inverse transform ---<br>
+        int part = partitionFromSize(tuSize);<br>
+<br>
         if (absSum)<br>
         {<br>
+            //--- inverse transform ---<br>
             int scalingListType = 0 + TEXT_LUMA;<br>
             X265_CHECK(scalingListType < 6, "scalingListType %d\n", scalingListType);<br>
             m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), cu->getLumaIntraDir(absPartIdx), residual, stride, coeff, tuSize, scalingListType, useTransformSkip, lastPos);<br>
+<br>
+            // Generate Recon<br>
+            primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);<br>
+            primitives.luma_copy_pp[part](reconIPred, reconIPredStride, recon, stride);<br>
         }<br>
         else<br>
         {<br>
-            int16_t* resiTmp = residual;<br>
+#if CHECKED_BUILD || _DEBUG<br>
             memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);<br>
-            primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);<br>
+#endif<br>
+<br>
+            // Generate Recon<br>
+            primitives.luma_copy_pp[part](recon,      stride,           pred, stride);<br>
+            primitives.luma_copy_pp[part](reconIPred, reconIPredStride, pred, stride);<br>
         }<br>
-<br>
-        //Generate Recon<br>
-        X265_CHECK(tuSize <= 32, "tuSize is too large\n");<br>
-        int part = partitionFromSize(tuSize);<br>
-        primitives.luma_add_ps[part](recon, stride, pred, residual, stride, stride);<br>
-        primitives.blockcpy_pp(tuSize, tuSize, reconIPred, reconIPredStride, recon, stride);<br>
     }<br>
<br>
     if (bCheckSplit && !bCheckFull)<br>
@@ -980,8 +988,10 @@<br>
     pixel*     reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zOrder);<br>
     uint32_t   reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();<br>
     int16_t*   reconQt          = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);<br>
-    primitives.blockcpy_ps(trSize, trSize, reconIPred, reconIPredStride, reconQt, MAX_CU_SIZE);<br>
     X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");<br>
+    const uint32_t reconQtStride = MAX_CU_SIZE;<br>
+    int sizeIdx = trSizeLog2 - 2;<br>
+    primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);<br>
 }<br>
<br>
 void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)<br>
@@ -1059,8 +1069,7 @@<br>
         }<br>
<br>
         //===== copy transform coefficients =====<br>
-        uint32_t trSizeC  = 1 << trSizeCLog2;<br>
-        uint32_t numCoeffC = 1 << trSizeCLog2 * 2;<br>
+        uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);<br>
         uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));<br>
<br>
         coeff_t* coeffDst = m_qtTempCoeff[chromaId][qtlayer] + coeffOffsetC;<br>
@@ -1072,12 +1081,13 @@<br>
         m_qtTempTransformSkipYuv.copyPartToPartChroma(&m_qtTempShortYuv[qtlayer], absPartIdx, lumaSize, chromaId, splitIntoSubTUs);<br>
<br>
         uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;<br>
-        uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
         uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();<br>
<br>
         pixel* reconIPred = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);<br>
         int16_t* reconQt  = m_qtTempShortYuv[qtlayer].getChromaAddr(chromaId, absPartIdx);<br>
-        primitives.blockcpy_ps(trSizeC, trSizeC, reconIPred, reconIPredStride, reconQt, reconQtStride);<br>
+        uint32_t reconQtStride    = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
+        int sizeIdxC = trSizeCLog2 - 2;<br>
+        primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);<br>
     }<br>
 }<br>
<br>
@@ -1387,6 +1397,7 @@<br>
         uint32_t stride = fencYuv->getCStride();<br>
         const bool splitIntoSubTUs = (chFmt == CHROMA_422);<br>
         int sizeIdx = g_convertToBit[tuSize];<br>
+        int part = partitionFromSize(tuSize);<br>
<br>
         for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++)<br>
         {<br>
@@ -1456,28 +1467,28 @@<br>
                 //--- set coded block flag ---<br>
                 cu->setCbfPartRange((((absSum > 0) ? 1 : 0) << origTrDepth), ttype, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
<br>
-                //--- inverse transform ---<br>
                 if (absSum)<br>
                 {<br>
+                    //--- inverse transform ---<br>
                     int scalingListType = 0 + ttype;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
                     m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, residual, stride, coeff, tuSize, scalingListType, useTransformSkipChroma, lastPos);<br>
+<br>
+                    //===== reconstruction =====<br>
+                    // use square primitives<br>
+                    primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);<br>
+                    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, recon, stride);<br>
                 }<br>
                 else<br>
                 {<br>
-                    int16_t* resiTmp = residual;<br>
+#if CHECKED_BUILD || _DEBUG<br>
                     memset(coeff, 0, sizeof(coeff_t) * tuSize * tuSize);<br>
-                    primitives.blockfill_s[sizeIdx](resiTmp, stride, 0);<br>
+#endif<br>
+<br>
+                    //===== reconstruction =====<br>
+                    primitives.square_copy_pp[sizeIdx](recon,      stride,           pred, stride);<br>
+                    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, pred, stride);<br>
                 }<br>
-<br>
-                //===== reconstruction =====<br>
-                X265_CHECK(((intptr_t)residual & (tuSize - 1)) == 0, "residual alignment check failed\n");<br>
-                X265_CHECK(tuSize <= 32, "tuSize out of range\n");<br>
-<br>
-                // use square primitive<br>
-                int part = partitionFromSize(tuSize);<br>
-                primitives.chroma[CHROMA_444].add_ps[part](recon, stride, pred, residual, stride, stride);<br>
-                primitives.chroma[CHROMA_444].copy_pp[part](reconIPred, reconIPredStride, recon, stride);<br>
             }<br>
             while (isNextSection(&tuIterator));<br>
<br>
@@ -1859,7 +1870,6 @@<br>
<br>
 void TEncSearch::initSection(TComTURecurse *tuIterator, uint32_t splitMode, uint32_t absPartIdxStep, uint32_t m_absPartIdxTU)<br>
 {<br>
-    tuIterator->m_partOffset        = 0;<br>
     tuIterator->m_section           = 0;<br>
     tuIterator->m_absPartIdxTURelCU = m_absPartIdxTU;<br>
     tuIterator->m_splitMode         = splitMode;<br>
@@ -1874,16 +1884,21 @@<br>
 {<br>
     uint32_t depth              = cu->getDepth(0);<br>
     uint32_t initTrDepth        = (cu->getPartitionSize(0) != SIZE_2Nx2N) && (cu->getChromaFormat() == CHROMA_444 ? 1 : 0);<br>
-<br>
+    uint32_t tuSize             = cu->getCUSize(0) >> initTrDepth;<br>
     uint32_t splitMode          = (initTrDepth == 0) ? DONT_SPLIT : QUAD_SPLIT;<br>
     uint32_t absPartIdx         = (cu->getPic()->getNumPartInCU() >> (depth << 1));<br>
<br>
+    int chFmt = cu->getChromaFormat();<br>
+    int part = partitionFromSize(tuSize);<br>
+<br>
     TComTURecurse tuIterator;<br>
<br>
     initSection(&tuIterator, splitMode, absPartIdx);<br>
<br>
     do<br>
     {<br>
+        uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;<br>
+<br>
         uint32_t bestMode           = 0;<br>
         uint32_t bestDist           = 0;<br>
         uint64_t bestCost           = MAX_INT64;<br>
@@ -1893,9 +1908,7 @@<br>
         uint32_t maxMode = NUM_CHROMA_MODE;<br>
         uint32_t modeList[NUM_CHROMA_MODE];<br>
<br>
-        tuIterator.m_partOffset = tuIterator.m_absPartIdxTURelCU;<br>
-<br>
-        cu->getAllowedChromaDir(tuIterator.m_partOffset, modeList);<br>
+        cu->getAllowedChromaDir(absPartIdxC, modeList);<br>
<br>
         //----- check chroma modes -----<br>
         for (uint32_t mode = minMode; mode < maxMode; mode++)<br>
@@ -1906,16 +1919,16 @@<br>
             //----- chroma coding -----<br>
             uint32_t dist = 0;<br>
<br>
-            cu->setChromIntraDirSubParts(modeList[mode], tuIterator.m_partOffset, depth + initTrDepth);<br>
-<br>
-            xRecurIntraChromaCodingQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, fencYuv, predYuv, resiYuv, dist);<br>
+            cu->setChromIntraDirSubParts(modeList[mode], absPartIdxC, depth + initTrDepth);<br>
+<br>
+            xRecurIntraChromaCodingQT(cu, initTrDepth, absPartIdxC, fencYuv, predYuv, resiYuv, dist);<br>
<br>
             if (cu->getSlice()->getPPS()->getUseTransformSkip())<br>
             {<br>
                 m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_CURR_BEST]);<br>
             }<br>
<br>
-            uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, tuIterator.m_absPartIdxStep, false, true);<br>
+            uint32_t bits = xGetIntraBitsQT(cu, initTrDepth, absPartIdxC, tuIterator.m_absPartIdxStep, false, true);<br>
             uint64_t cost = m_rdCost->calcRdCost(dist, bits);<br>
<br>
             //----- compare -----<br>
@@ -1924,37 +1937,36 @@<br>
                 bestCost = cost;<br>
                 bestDist = dist;<br>
                 bestMode = modeList[mode];<br>
-                xSetIntraResultChromaQT(cu, initTrDepth, tuIterator.m_absPartIdxTURelCU, reconYuv);<br>
-                ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-                ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-                ::memcpy(m_qtTempTransformSkipFlag[1], cu->getTransformSkip(TEXT_CHROMA_U) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-                ::memcpy(m_qtTempTransformSkipFlag[2], cu->getTransformSkip(TEXT_CHROMA_V) + tuIterator.m_partOffset, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+                xSetIntraResultChromaQT(cu, initTrDepth, absPartIdxC, reconYuv);<br>
+                ::memcpy(m_qtTempCbf[1], cu->getCbf(TEXT_CHROMA_U) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+                ::memcpy(m_qtTempCbf[2], cu->getCbf(TEXT_CHROMA_V) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+                ::memcpy(m_qtTempTransformSkipFlag[1], cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+                ::memcpy(m_qtTempTransformSkipFlag[2], cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC, tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
             }<br>
         }<br>
<br>
         if (!isLastSection(&tuIterator))<br>
         {<br>
-            uint32_t compWidth   = (cu->getCUSize(0) >> m_hChromaShift) >> initTrDepth;<br>
-            uint32_t compHeight  = (cu->getCUSize(0) >> m_vChromaShift) >> initTrDepth;<br>
-            uint32_t zorder      = cu->getZorderIdxInCU() + tuIterator.m_partOffset;<br>
-            pixel*     dst         = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);<br>
+            uint32_t zorder      = cu->getZorderIdxInCU() + absPartIdxC;<br>
             uint32_t dststride   = cu->getPic()->getPicYuvRec()->getCStride();<br>
-            pixel*     src         = reconYuv->getCbAddr(tuIterator.m_partOffset);<br>
             uint32_t srcstride   = reconYuv->getCStride();<br>
-<br>
-            primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);<br>
-<br>
-            dst                 = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);<br>
-            src                 = reconYuv->getCrAddr(tuIterator.m_partOffset);<br>
-            primitives.blockcpy_pp(compWidth, compHeight, dst, dststride, src, srcstride);<br>
+            pixel *src, *dst;<br>
+<br>
+            dst = cu->getPic()->getPicYuvRec()->getCbAddr(cu->getAddr(), zorder);<br>
+            src = reconYuv->getCbAddr(absPartIdxC);<br>
+            primitives.chroma[chFmt].copy_pp[part](dst, dststride, src, srcstride);<br>
+<br>
+            dst = cu->getPic()->getPicYuvRec()->getCrAddr(cu->getAddr(), zorder);<br>
+            src = reconYuv->getCrAddr(absPartIdxC);<br>
+            primitives.chroma[chFmt].copy_pp[part](dst, dststride, src, srcstride);<br>
         }<br>
<br>
         //----- set data -----<br>
-        ::memcpy(cu->getCbf(TEXT_CHROMA_U) + tuIterator.m_partOffset, m_qtTempCbf[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-        ::memcpy(cu->getCbf(TEXT_CHROMA_V) + tuIterator.m_partOffset, m_qtTempCbf[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + tuIterator.m_partOffset, m_qtTempTransformSkipFlag[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + tuIterator.m_partOffset, m_qtTempTransformSkipFlag[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
-        cu->setChromIntraDirSubParts(bestMode, tuIterator.m_partOffset, depth + initTrDepth);<br>
+        ::memcpy(cu->getCbf(TEXT_CHROMA_U) + absPartIdxC, m_qtTempCbf[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+        ::memcpy(cu->getCbf(TEXT_CHROMA_V) + absPartIdxC, m_qtTempCbf[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_U) + absPartIdxC, m_qtTempTransformSkipFlag[1], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+        ::memcpy(cu->getTransformSkip(TEXT_CHROMA_V) + absPartIdxC, m_qtTempTransformSkipFlag[2], tuIterator.m_absPartIdxStep * sizeof(uint8_t));<br>
+        cu->setChromIntraDirSubParts(bestMode, absPartIdxC, depth + initTrDepth);<br>
         cu->m_totalDistortion += bestDist;<br>
     }<br>
     while (isNextSection(&tuIterator));<br>
@@ -2685,9 +2697,11 @@<br>
         ::memset(cu->getCbf(TEXT_LUMA), 0, qpartnum * sizeof(uint8_t));<br>
         ::memset(cu->getCbf(TEXT_CHROMA_U), 0, qpartnum * sizeof(uint8_t));<br>
         ::memset(cu->getCbf(TEXT_CHROMA_V), 0, qpartnum * sizeof(uint8_t));<br>
+#if CHECKED_BUILD || _DEBUG<br>
         ::memset(cu->getCoeffY(), 0, cuSize * cuSize * sizeof(coeff_t));<br>
         ::memset(cu->getCoeffCb(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));<br>
         ::memset(cu->getCoeffCr(), 0, cuSize * cuSize * sizeof(coeff_t) >> (m_hChromaShift + m_vChromaShift));<br>
+#endif<br>
         cu->setTransformSkipSubParts(0, 0, 0, 0, cu->getDepth(0));<br>
     }<br>
     else<br>
@@ -2841,25 +2855,26 @@<br>
         m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);<br>
         m_trQuant->selectLambda(TEXT_LUMA);<br>
<br>
-        absSumY = m_trQuant->transformNxN(cu, resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, coeffCurY,<br>
+        int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);<br>
+        const uint32_t strideResiY = resiYuv->m_width;<br>
+        const uint32_t strideResiC = resiYuv->m_cwidth;<br>
+<br>
+        absSumY = m_trQuant->transformNxN(cu, curResiY, strideResiY, coeffCurY,<br>
                                           trSize, TEXT_LUMA, absPartIdx, &lastPosY, false, curuseRDOQ);<br>
<br>
         cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);<br>
<br>
         if (absSumY)<br>
         {<br>
-            int16_t *curResiY = resiYuv->getLumaAddr(absPartIdx);<br>
-<br>
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);<br>
<br>
             int scalingListType = 3 + TEXT_LUMA;<br>
             X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, resiYuv->m_width,  coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only<br>
+            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPosY); //this is for inter mode only<br>
         }<br>
         else<br>
         {<br>
-            int16_t *ptr = resiYuv->getLumaAddr(absPartIdx);<br>
-            primitives.blockfill_s[sizeIdx](ptr, resiYuv->m_width, 0);<br>
+            primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);<br>
         }<br>
         cu->setCbfSubParts(absSumY ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);<br>
<br>
@@ -2873,6 +2888,9 @@<br>
                 uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;<br>
                 uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;<br>
<br>
+                int16_t *curResiU = resiYuv->getCbAddr(absPartIdxC);<br>
+                int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);<br>
+<br>
                 cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
                 cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
<br>
@@ -2881,12 +2899,12 @@<br>
<br>
                 m_trQuant->selectLambda(TEXT_CHROMA);<br>
<br>
-                absSumU = m_trQuant->transformNxN(cu, resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurU + subTUBufferOffset,<br>
+                absSumU = m_trQuant->transformNxN(cu, curResiU, strideResiC, coeffCurU + subTUBufferOffset,<br>
                                                   trSizeC, TEXT_CHROMA_U, absPartIdxC, &lastPosU, false, curuseRDOQ);<br>
<br>
                 curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();<br>
                 m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
-                absSumV = m_trQuant->transformNxN(cu, resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, coeffCurV + subTUBufferOffset,<br>
+                absSumV = m_trQuant->transformNxN(cu, curResiV, strideResiC, coeffCurV + subTUBufferOffset,<br>
                                                   trSizeC, TEXT_CHROMA_V, absPartIdxC, &lastPosV, false, curuseRDOQ);<br>
<br>
                 cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
@@ -2894,34 +2912,29 @@<br>
<br>
                 if (absSumU)<br>
                 {<br>
-                    int16_t *pcResiCurrU = resiYuv->getCbAddr(absPartIdxC);<br>
-<br>
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();<br>
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_U;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, resiYuv->m_cwidth, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset, trSizeC, scalingListType, false, lastPosU);<br>
                 }<br>
                 else<br>
                 {<br>
-                    int16_t *ptr = resiYuv->getCbAddr(absPartIdxC);<br>
-                    primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);<br>
+                    primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);<br>
                 }<br>
                 if (absSumV)<br>
                 {<br>
-                    int16_t *curResiV = resiYuv->getCrAddr(absPartIdxC);<br>
                     curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();<br>
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_V;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, resiYuv->m_cwidth, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset, trSizeC, scalingListType, false, lastPosV);<br>
                 }<br>
                 else<br>
                 {<br>
-                    int16_t *ptr = resiYuv->getCrAddr(absPartIdxC);<br>
-                    primitives.blockfill_s[sizeIdxC](ptr, resiYuv->m_cwidth, 0);<br>
+                    primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);<br>
                 }<br>
                 cu->setCbfPartRange(absSumU ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
                 cu->setCbfPartRange(absSumV ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
@@ -3027,6 +3040,8 @@<br>
     if (bCheckFull)<br>
     {<br>
         uint32_t trSizeC = 1 << trSizeCLog2;<br>
+        int sizeIdx  = trSizeLog2 - 2;<br>
+        int sizeIdxC = trSizeCLog2 - 2;<br>
         const uint32_t qtlayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;<br>
         uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;<br>
         uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);<br>
@@ -3070,7 +3085,7 @@<br>
             do<br>
             {<br>
                 uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;<br>
-                uint32_t subTUBufferOffset    = trSizeC * trSizeC * tuIterator.m_section;<br>
+                uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;<br>
<br>
                 cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
                 cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
@@ -3112,7 +3127,8 @@<br>
             while (isNextSection(&tuIterator));<br>
         }<br>
<br>
-        const uint32_t numSamplesLuma = 1 << (trSizeLog2 << 1);<br>
+        const uint32_t numCoeffY = 1 << (trSizeLog2 * 2);<br>
+        const uint32_t numCoeffC = 1 << (trSizeCLog2 * 2);<br>
<br>
         for (uint32_t subTUIndex = 0; subTUIndex < 2; subTUIndex++)<br>
         {<br>
@@ -3123,6 +3139,10 @@<br>
<br>
         int partSize = partitionFromSize(trSize);<br>
         uint32_t distY = primitives.sse_sp[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, (pixel*)RDCost::zeroPel, trSize);<br>
+        int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);<br>
+        X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");<br>
+        const uint32_t strideResiY = MAX_CU_SIZE;<br>
+        const uint32_t strideResiC = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
<br>
         if (outZeroDist)<br>
         {<br>
@@ -3130,16 +3150,13 @@<br>
         }<br>
         if (absSum[TEXT_LUMA][0])<br>
         {<br>
-            int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);<br>
-<br>
             m_trQuant->setQPforQuant(cu->getQP(0), TEXT_LUMA, QP_BD_OFFSET, 0, chFmt);<br>
<br>
             int scalingListType = 3 + TEXT_LUMA;<br>
             X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");<br>
-            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only<br>
-<br>
-            const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx), MAX_CU_SIZE);<br>
+            m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, false, lastPos[TEXT_LUMA][0]); //this is for inter mode only<br>
+<br>
+            const uint32_t nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width, curResiY, strideResiY);<br>
             if (cu->isLosslessCoded(0))<br>
             {<br>
                 distY = nonZeroDistY;<br>
@@ -3154,7 +3171,9 @@<br>
                 if (nullCostY < singleCostY)<br>
                 {<br>
                     absSum[TEXT_LUMA][0] = 0;<br>
-                    ::memset(coeffCurY, 0, sizeof(coeff_t) * numSamplesLuma);<br>
+#if CHECKED_BUILD || _DEBUG<br>
+                    ::memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);<br>
+#endif<br>
                     if (checkTransformSkipY)<br>
                     {<br>
                         minCost[TEXT_LUMA][0] = nullCostY;<br>
@@ -3182,10 +3201,7 @@<br>
<br>
         if (!absSum[TEXT_LUMA][0])<br>
         {<br>
-            int16_t *ptr =  m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);<br>
-            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");<br>
-            int sizeIdx = trSizeLog2 - 2;<br>
-            primitives.blockfill_s[sizeIdx](ptr, MAX_CU_SIZE, 0);<br>
+            primitives.blockfill_s[sizeIdx](curResiY, strideResiY, 0);<br>
         }<br>
         cu->setCbfSubParts(absSum[TEXT_LUMA][0] ? setCbf : 0, TEXT_LUMA, absPartIdx, depth);<br>
<br>
@@ -3197,13 +3213,15 @@<br>
             initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);<br>
<br>
             int partSizeC = partitionFromSize(trSizeC);<br>
-            const uint32_t numSamplesChroma = trSizeC * trSizeC;<br>
<br>
             do<br>
             {<br>
                 uint32_t absPartIdxC = tuIterator.m_absPartIdxTURelCU;<br>
                 uint32_t subTUBufferOffset = trSizeC * trSizeC * tuIterator.m_section;<br>
<br>
+                int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);<br>
+                int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);<br>
+<br>
                 distU = m_rdCost->scaleChromaDistCb(primitives.sse_sp[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));<br>
<br>
                 if (outZeroDist)<br>
@@ -3212,18 +3230,15 @@<br>
                 }<br>
                 if (absSum[TEXT_CHROMA_U][tuIterator.m_section])<br>
                 {<br>
-                    int16_t *pcResiCurrU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);<br>
-<br>
                     int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCbQpOffset() + cu->getSlice()->getSliceQpDeltaCb();<br>
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_U;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, pcResiCurrU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,<br>
                                                trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_U][tuIterator.m_section]);<br>
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,<br>
-                                                                 m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),<br>
-                                                                 m_qtTempShortYuv[qtlayer].m_cwidth);<br>
+                                                                 curResiU, strideResiC);<br>
                     const uint32_t nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);<br>
<br>
                     if (cu->isLosslessCoded(0))<br>
@@ -3240,7 +3255,9 @@<br>
                         if (nullCostU < singleCostU)<br>
                         {<br>
                             absSum[TEXT_CHROMA_U][tuIterator.m_section] = 0;<br>
-                            ::memset(coeffCurU + subTUBufferOffset, 0, sizeof(coeff_t) * numSamplesChroma);<br>
+#if CHECKED_BUILD || _DEBUG<br>
+                            ::memset(coeffCurU + subTUBufferOffset, 0, sizeof(coeff_t) * numCoeffC);<br>
+#endif<br>
                             if (checkTransformSkipUV)<br>
                             {<br>
                                 minCost[TEXT_CHROMA_U][tuIterator.m_section] = nullCostU;<br>
@@ -3268,10 +3285,7 @@<br>
<br>
                 if (!absSum[TEXT_CHROMA_U][tuIterator.m_section])<br>
                 {<br>
-                    int16_t *ptr = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);<br>
-                    const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
-                    int sizeIdxC = trSizeCLog2 - 2;<br>
-                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);<br>
+                    primitives.blockfill_s[sizeIdxC](curResiU, strideResiC, 0);<br>
                 }<br>
<br>
                 distV = m_rdCost->scaleChromaDistCr(primitives.sse_sp[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth, (pixel*)RDCost::zeroPel, trSizeC));<br>
@@ -3281,17 +3295,15 @@<br>
                 }<br>
                 if (absSum[TEXT_CHROMA_V][tuIterator.m_section])<br>
                 {<br>
-                    int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);<br>
                     int curChromaQpOffset = cu->getSlice()->getPPS()->getChromaCrQpOffset() + cu->getSlice()->getSliceQpDeltaCr();<br>
                     m_trQuant->setQPforQuant(cu->getQP(0), TEXT_CHROMA, cu->getSlice()->getSPS()->getQpBDOffsetC(), curChromaQpOffset, chFmt);<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_V;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,<br>
                                                trSizeC, scalingListType, false, lastPos[TEXT_CHROMA_V][tuIterator.m_section]);<br>
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,<br>
-                                                                 m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),<br>
-                                                                 m_qtTempShortYuv[qtlayer].m_cwidth);<br>
+                                                                 curResiV, strideResiC);<br>
                     const uint32_t nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);<br>
<br>
                     if (cu->isLosslessCoded(0))<br>
@@ -3308,7 +3320,9 @@<br>
                         if (nullCostV < singleCostV)<br>
                         {<br>
                             absSum[TEXT_CHROMA_V][tuIterator.m_section] = 0;<br>
-                            ::memset(coeffCurV + subTUBufferOffset, 0, sizeof(coeff_t) * numSamplesChroma);<br>
+#if CHECKED_BUILD || _DEBUG<br>
+                            ::memset(coeffCurV + subTUBufferOffset, 0, sizeof(coeff_t) * numCoeffC);<br>
+#endif<br>
                             if (checkTransformSkipUV)<br>
                             {<br>
                                 minCost[TEXT_CHROMA_V][tuIterator.m_section] = nullCostV;<br>
@@ -3336,10 +3350,7 @@<br>
<br>
                 if (!absSum[TEXT_CHROMA_V][tuIterator.m_section])<br>
                 {<br>
-                    int16_t *ptr =  m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);<br>
-                    const uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
-                    int sizeIdxC = trSizeCLog2 - 2;<br>
-                    primitives.blockfill_s[sizeIdxC](ptr, stride, 0);<br>
+                    primitives.blockfill_s[sizeIdxC](curResiV, strideResiC, 0);<br>
                 }<br>
<br>
                 cu->setCbfPartRange(absSum[TEXT_CHROMA_U][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
@@ -3354,17 +3365,11 @@<br>
             uint32_t nonZeroDistY = 0, absSumTransformSkipY;<br>
             uint64_t singleCostY = MAX_INT64;<br>
<br>
-            int16_t *curResiY = m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx);<br>
-            X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");<br>
-<br>
-            coeff_t bestCoeffY[32 * 32];<br>
-            memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numSamplesLuma);<br>
-<br>
-            int16_t bestResiY[32 * 32];<br>
-            for (int i = 0; i < trSize; ++i)<br>
-            {<br>
-                memcpy(bestResiY + i * trSize, curResiY + i * MAX_CU_SIZE, sizeof(int16_t) * trSize);<br>
-            }<br>
+            coeff_t bestCoeffY[MAX_TS_SIZE * MAX_TS_SIZE];<br>
+            memcpy(bestCoeffY, coeffCurY, sizeof(coeff_t) * numCoeffY);<br>
+<br>
+            int16_t bestResiY[MAX_TS_SIZE * MAX_TS_SIZE];<br>
+            primitives.square_copy_ss[sizeIdx](bestResiY, trSize, curResiY, strideResiY);<br>
<br>
             m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);<br>
<br>
@@ -3393,13 +3398,11 @@<br>
<br>
                 int scalingListType = 3 + TEXT_LUMA;<br>
                 X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                X265_CHECK(m_qtTempShortYuv[qtlayer].m_width == MAX_CU_SIZE, "width not full CU\n");<br>
-<br>
-                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, MAX_CU_SIZE,  coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);<br>
+<br>
+                m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdx), REG_DCT, curResiY, strideResiY,  coeffCurY, trSize, scalingListType, true, lastPosTransformSkip[TEXT_LUMA][0]);<br>
<br>
                 nonZeroDistY = primitives.sse_ss[partSize](resiYuv->getLumaAddr(absPartIdx), resiYuv->m_width,<br>
-                                                           m_qtTempShortYuv[qtlayer].getLumaAddr(absPartIdx),<br>
-                                                           MAX_CU_SIZE);<br>
+                                                           curResiY, strideResiY);<br>
<br>
                 singleCostY = m_rdCost->calcRdCost(nonZeroDistY, skipSingleBitsY);<br>
             }<br>
@@ -3407,11 +3410,8 @@<br>
             if (!absSumTransformSkipY || minCost[TEXT_LUMA][0] < singleCostY)<br>
             {<br>
                 cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, depth);<br>
-                memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numSamplesLuma);<br>
-                for (int i = 0; i < trSize; ++i)<br>
-                {<br>
-                    memcpy(curResiY + i * MAX_CU_SIZE, &bestResiY[i * trSize], sizeof(int16_t) * trSize);<br>
-                }<br>
+                memcpy(coeffCurY, bestCoeffY, sizeof(coeff_t) * numCoeffY);<br>
+                primitives.square_copy_ss[sizeIdx](curResiY, strideResiY, bestResiY, trSize);<br>
             }<br>
             else<br>
             {<br>
@@ -3435,7 +3435,6 @@<br>
             initSection(&tuIterator, splitIntoSubTUs ? VERTICAL_SPLIT : DONT_SPLIT, absPartIdxStep, absPartIdx);<br>
<br>
             int partSizeC = partitionFromSize(trSizeC);<br>
-            const uint32_t numSamplesChroma = trSizeC * trSizeC;<br>
<br>
             do<br>
             {<br>
@@ -3444,18 +3443,14 @@<br>
<br>
                 int16_t *curResiU = m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC);<br>
                 int16_t *curResiV = m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC);<br>
-                uint32_t stride = m_qtTempShortYuv[qtlayer].m_cwidth;<br>
-<br>
-                coeff_t bestCoeffU[32 * 32], bestCoeffV[32 * 32];<br>
-                memcpy(bestCoeffU, coeffCurU + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);<br>
-                memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numSamplesChroma);<br>
-<br>
-                int16_t bestResiU[32 * 32], bestResiV[32 * 32];<br>
-                for (int i = 0; i < trSizeC; ++i)<br>
-                {<br>
-                    memcpy(&bestResiU[i * trSizeC], curResiU + i * stride, sizeof(int16_t) * trSizeC);<br>
-                    memcpy(&bestResiV[i * trSizeC], curResiV + i * stride, sizeof(int16_t) * trSizeC);<br>
-                }<br>
+<br>
+                coeff_t bestCoeffU[MAX_TS_SIZE * MAX_TS_SIZE], bestCoeffV[MAX_TS_SIZE * MAX_TS_SIZE];<br>
+                memcpy(bestCoeffU, coeffCurU + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);<br>
+                memcpy(bestCoeffV, coeffCurV + subTUBufferOffset, sizeof(coeff_t) * numCoeffC);<br>
+<br>
+                int16_t bestResiU[MAX_TS_SIZE * MAX_TS_SIZE], bestResiV[MAX_TS_SIZE * MAX_TS_SIZE];<br>
+                primitives.square_copy_ss[sizeIdxC](bestResiU, trSizeC, curResiU, strideResiC);<br>
+                primitives.square_copy_ss[sizeIdxC](bestResiV, trSizeC, curResiV, strideResiC);<br>
<br>
                 cu->setTransformSkipPartRange(1, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
                 cu->setTransformSkipPartRange(1, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
@@ -3493,11 +3488,10 @@<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_U;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurU + subTUBufferOffset,<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiU, strideResiC, coeffCurU + subTUBufferOffset,<br>
                                                trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_U][tuIterator.m_section]);<br>
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCbAddr(absPartIdxC), resiYuv->m_cwidth,<br>
-                                                                 m_qtTempShortYuv[qtlayer].getCbAddr(absPartIdxC),<br>
-                                                                 m_qtTempShortYuv[qtlayer].m_cwidth);<br>
+                                                                 curResiU, strideResiC);<br>
                     nonZeroDistU = m_rdCost->scaleChromaDistCb(dist);<br>
                     singleCostU = m_rdCost->calcRdCost(nonZeroDistU, singleBitsComp[TEXT_CHROMA_U][tuIterator.m_section]);<br>
                 }<br>
@@ -3506,11 +3500,8 @@<br>
                 {<br>
                     cu->setTransformSkipPartRange(0, TEXT_CHROMA_U, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
<br>
-                    memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numSamplesChroma);<br>
-                    for (int i = 0; i < trSizeC; ++i)<br>
-                    {<br>
-                        memcpy(curResiU + i * stride, &bestResiU[i * trSizeC], sizeof(int16_t) * trSizeC);<br>
-                    }<br>
+                    memcpy(coeffCurU + subTUBufferOffset, bestCoeffU, sizeof(coeff_t) * numCoeffC);<br>
+                    primitives.square_copy_ss[sizeIdxC](curResiU, strideResiC, bestResiU, trSizeC);<br>
                 }<br>
                 else<br>
                 {<br>
@@ -3530,11 +3521,10 @@<br>
<br>
                     int scalingListType = 3 + TEXT_CHROMA_V;<br>
                     X265_CHECK(scalingListType < 6, "scalingListType too large %d\n", scalingListType);<br>
-                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, m_qtTempShortYuv[qtlayer].m_cwidth, coeffCurV + subTUBufferOffset,<br>
+                    m_trQuant->invtransformNxN(cu->getCUTransquantBypass(absPartIdxC), REG_DCT, curResiV, strideResiC, coeffCurV + subTUBufferOffset,<br>
                                                trSizeC, scalingListType, true, lastPosTransformSkip[TEXT_CHROMA_V][tuIterator.m_section]);<br>
                     uint32_t dist = primitives.sse_ss[partSizeC](resiYuv->getCrAddr(absPartIdxC), resiYuv->m_cwidth,<br>
-                                                                 m_qtTempShortYuv[qtlayer].getCrAddr(absPartIdxC),<br>
-                                                                 m_qtTempShortYuv[qtlayer].m_cwidth);<br>
+                                                                 curResiV, strideResiC);<br>
                     nonZeroDistV = m_rdCost->scaleChromaDistCr(dist);<br>
                     singleCostV = m_rdCost->calcRdCost(nonZeroDistV, singleBitsComp[TEXT_CHROMA_V][tuIterator.m_section]);<br>
                 }<br>
@@ -3543,11 +3533,8 @@<br>
                 {<br>
                     cu->setTransformSkipPartRange(0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
<br>
-                    memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numSamplesChroma);<br>
-                    for (int i = 0; i < trSizeC; ++i)<br>
-                    {<br>
-                        memcpy(curResiV + i * stride, &bestResiV[i * trSizeC], sizeof(int16_t) * trSizeC);<br>
-                    }<br>
+                    memcpy(coeffCurV + subTUBufferOffset, bestCoeffV, sizeof(coeff_t) * numCoeffC);<br>
+                    primitives.square_copy_ss[sizeIdxC](curResiV, strideResiC, bestResiV, trSizeC);<br>
                 }<br>
                 else<br>
                 {<br>
@@ -3560,6 +3547,7 @@<br>
                 cu->setCbfPartRange(absSum[TEXT_CHROMA_V][tuIterator.m_section] ? setCbf : 0, TEXT_CHROMA_V, absPartIdxC, tuIterator.m_absPartIdxStep);<br>
             }<br>
             while (isNextSection(&tuIterator));<br>
+<br>
         }<br>
<br>
         m_rdGoOnSbacCoder->load(m_rdSbacCoders[depth][CI_QT_TRAFO_ROOT]);<br>
@@ -3929,7 +3917,7 @@<br>
<br>
             if (bCodeChroma)<br>
             {<br>
-                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, 1 << trSizeLog2, (bChromaSame && (chFmt != CHROMA_422)));<br>
+                m_qtTempShortYuv[qtlayer].copyPartToPartChroma(resiYuv, absPartIdx, trSize, (bChromaSame && (chFmt != CHROMA_422)));<br>
             }<br>
         }<br>
         else<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.cpp<br>
--- a/source/common/primitives.cpp      Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/common/primitives.cpp      Mon Jun 02 11:44:59 2014 +0900<br>
@@ -55,6 +55,11 @@<br>
     LUMA_4x4,  LUMA_8x8,  255,        LUMA_16x16, 255, 255,        255, LUMA_32x32, 255, 255, 255, 255,        255, 255, 255, LUMA_64x64<br>
 };<br>
<br>
+extern const uint8_t lumaPartitionsFromSquareBlocksTable[] =<br>
+{<br>
+    LUMA_4x4, LUMA_8x8, LUMA_16x16, LUMA_32x32, LUMA_64x64<br>
+};<br>
+<br>
 /* the "authoritative" set of encoder primitives */<br>
 EncoderPrimitives primitives;<br>
<br>
@@ -72,6 +77,31 @@<br>
     Setup_C_IPredPrimitives(p);      // intrapred.cpp<br>
     Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp<br>
 }<br>
+<br>
+static void Setup_Alias_Primitives(EncoderPrimitives &p)<br>
+{<br>
+    /* copy reusable luma primitives to chroma 4:4:4 */<br>
+    for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)<br>
+    {<br>
+        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];<br>
+        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];<br>
+        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];<br>
+        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];<br>
+        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];<br>
+        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];<br>
+        p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];<br>
+    }<br>
+<br>
+    for (int i = 0; i < NUM_SQUARE_BLOCKS; i++)<br>
+    {<br>
+        int partL = lumaPartitionsFromSquareBlocksTable[i];<br>
+        p.sad_square[i]     = p.sad[partL];<br>
+        p.square_copy_pp[i] = p.luma_copy_pp[partL];<br>
+        p.square_copy_ps[i] = p.luma_copy_ps[partL];<br>
+        p.square_copy_sp[i] = p.luma_copy_sp[partL];<br>
+        p.square_copy_ss[i] = p.luma_copy_ss[partL];<br>
+    }<br>
+}<br>
 }<br>
 using namespace x265;<br>
<br>
@@ -95,6 +125,8 @@<br>
         x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");<br>
 #endif<br>
<br>
+        Setup_Alias_Primitives(primitives);<br>
+<br>
         initROM();<br>
     }<br>
<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/common/primitives.h<br>
--- a/source/common/primitives.h        Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/common/primitives.h        Mon Jun 02 11:44:59 2014 +0900<br>
@@ -213,6 +213,10 @@<br>
     copy_ss_t       luma_copy_ss[NUM_LUMA_PARTITIONS];<br>
     pixel_sub_ps_t  luma_sub_ps[NUM_LUMA_PARTITIONS];<br>
     pixel_add_ps_t  luma_add_ps[NUM_LUMA_PARTITIONS];<br>
+    copy_pp_t       square_copy_pp[NUM_SQUARE_BLOCKS];<br>
+    copy_sp_t       square_copy_sp[NUM_SQUARE_BLOCKS];<br>
+    copy_ps_t       square_copy_ps[NUM_SQUARE_BLOCKS];<br>
+    copy_ss_t       square_copy_ss[NUM_SQUARE_BLOCKS];<br>
<br>
     filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];<br>
     filter_hps_t    luma_hps[NUM_LUMA_PARTITIONS];<br>
diff -r a5998df9b12e -r 73f86312c2e0 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp      Mon Jun 02 07:36:20 2014 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp      Mon Jun 02 11:44:59 2014 +0900<br>
@@ -1316,30 +1316,12 @@<br>
     }<br>
 #endif // if HIGH_BIT_DEPTH<br>
<br>
-    /* copy reusable luma primitives to chroma 4:4:4 */<br>
-    for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)<br>
-    {<br>
-        p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];<br>
-        p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];<br>
-        p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];<br>
-        p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];<br>
-        p.chroma[X265_CSP_I444].add_ps[i]  = p.luma_add_ps[i];<br>
-        p.chroma[X265_CSP_I444].sub_ps[i]  = p.luma_sub_ps[i];<br>
-        p.chroma[X265_CSP_I444].addAvg[i]  = p.luma_addAvg[i];<br>
-    }<br>
-<br>
     primitives.sa8d[BLOCK_4x4]   = primitives.sa8d_inter[LUMA_4x4];<br>
     primitives.sa8d[BLOCK_8x8]   = primitives.sa8d_inter[LUMA_8x8];<br>
     primitives.sa8d[BLOCK_16x16] = primitives.sa8d_inter[LUMA_16x16];<br>
     primitives.sa8d[BLOCK_32x32] = primitives.sa8d_inter[LUMA_32x32];<br>
     primitives.sa8d[BLOCK_64x64] = primitives.sa8d_inter[LUMA_64x64];<br>
<br>
-    primitives.sad_square[BLOCK_4x4]   = primitives.sad[LUMA_4x4];<br>
-    primitives.sad_square[BLOCK_8x8]   = primitives.sad[LUMA_8x8];<br>
-    primitives.sad_square[BLOCK_16x16] = primitives.sad[LUMA_16x16];<br>
-    primitives.sad_square[BLOCK_32x32] = primitives.sad[LUMA_32x32];<br>
-    primitives.sad_square[BLOCK_64x64] = primitives.sad[LUMA_64x64];<br>
-<br>
     // SA8D devolves to SATD for blocks not even multiples of 8x8<br>
     primitives.sa8d_inter[LUMA_4x4]   = primitives.satd[LUMA_4x4];<br>
     primitives.sa8d_inter[LUMA_4x8]   = primitives.satd[LUMA_4x8];<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>