[x265] [PATCH 4 of 8] search & analysis: modify to support 400 color space

Mon Dec 14 20:30:10 CET 2015

# HG changeset patch
# User Mahesh Pittala <mahesh at multicorewareinc.com>
# Date 1450017438 -19800
#      Sun Dec 13 20:07:18 2015 +0530
# Node ID d7200ee7910d88b970a80c41da1f63ab4ce4166d
# Parent  d01cd1fee4e30e2dd4ea90490e471417e5bf47d5
search & analysis: modify to support 400 color space

diff -r d01cd1fee4e3 -r d7200ee7910d source/encoder/analysis.cpp

--- a/source/encoder/analysis.cpp	Sun Dec 13 19:57:55 2015 +0530
+++ b/source/encoder/analysis.cpp	Sun Dec 13 20:07:18 2015 +0530
@@ -689,7 +689,7 @@
             if (m_param->rdLevel > 2)
             {
                 /* RD selection between merge, inter, bidir and intra */
-                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
+                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
@@ -1098,7 +1098,7 @@
             if (m_param->rdLevel >= 3)
             {
                 /* Calculate RD cost of best inter option */
-                if (!m_bChromaSa8d) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
+                if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */
                 {
                     uint32_t numPU = bestInter->cu.getNumPartInter(0);
                     for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
@@ -1173,10 +1173,13 @@
                 else if (md.bestMode->cu.isInter(0))
                 {
                     uint32_t numPU = md.bestMode->cu.getNumPartInter(0);
-                    for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                    if (m_csp != X265_CSP_I400)
                     {
-                        PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
-                        motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
+                        for (uint32_t puIdx = 0; puIdx < numPU; puIdx++)
+                        {
+                            PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);
+                            motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);
+                        }
                     }
                     if (m_param->rdLevel == 2)
                         encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);
@@ -1187,7 +1190,6 @@
 
                         uint32_t tuDepthRange[2];
                         cu.getInterTUQtDepthRange(tuDepthRange, 0);
-
                         m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize);
                         residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
                         if (cu.getQtRootCbf(0))
@@ -1213,8 +1215,11 @@
                         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
                         residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);
-                        getBestIntraModeChroma(*md.bestMode, cuGeom);
-                        residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+                        if (m_csp != X265_CSP_I400)
+                        {
+                            getBestIntraModeChroma(*md.bestMode, cuGeom);
+                            residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);
+                        }
                         md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:
                     }
                 }
@@ -1701,12 +1706,11 @@
         tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;
         tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;
         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
-
-        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d);
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
 
         tempPred->sa8dBits = getTUBits(i, numMergeCand);
         tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);
-        if (m_bChromaSa8d)
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
         {
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);
             tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);
@@ -1725,7 +1729,7 @@
         return;
 
     /* calculate the motion compensation for chroma for the best mode selected */
-    if (!m_bChromaSa8d) /* Chroma MC was done above */
+    if (!m_bChromaSa8d && (m_csp != X265_CSP_I400)) /* Chroma MC was done above */
         motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);
 
     if (m_param->rdLevel)
@@ -1848,7 +1852,7 @@
         tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;
         tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */
 
-        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true);
+        motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_csp != X265_CSP_I400);
 
         uint8_t hasCbf = true;
         bool swapped = false;
@@ -1935,15 +1939,14 @@
             }
         }
     }
-
-    predInterSearch(interMode, cuGeom, m_bChromaSa8d, refMask);
+    predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400), refMask);
 
     /* predInterSearch sets interMode.sa8dBits */
     const Yuv& fencYuv = *interMode.fencYuv;
     Yuv& predYuv = interMode.predYuv;
     int part = partitionFromLog2Size(cuGeom.log2CUSize);
     interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);
-    if (m_bChromaSa8d)
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
     {
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);
         interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);
@@ -1992,8 +1995,7 @@
             }
         }
     }
-
-    predInterSearch(interMode, cuGeom, true, refMask);
+    predInterSearch(interMode, cuGeom, m_csp != X265_CSP_I400, refMask);
 
     /* predInterSearch sets interMode.sa8dBits, but this is ignored */
     encodeResAndCalcRdInterCU(interMode, cuGeom);
@@ -2060,10 +2062,10 @@
     cu.m_mvd[1][0] = bestME[1].mv - mvp1;
 
     PredictionUnit pu(cu, cuGeom, 0);
-    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d);
+    motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400));
 
     int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size);
-    if (m_bChromaSa8d)
+    if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
     {
         /* Add in chroma distortion */
         sa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, bidir2Nx2N.predYuv.m_buf[1], bidir2Nx2N.predYuv.m_csize);
@@ -2094,16 +2096,16 @@
 
         int zsa8d;
 
-        if (m_bChromaSa8d)
+        if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
         {
             cu.m_mv[0][0] = mvzero;
             cu.m_mv[1][0] = mvzero;
 
             motionCompensation(cu, pu, tmpPredYuv, true, true);
-
             zsa8d  = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size);
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize);
             zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, tmpPredYuv.m_buf[2], tmpPredYuv.m_csize);
+
         }
         else
         {
@@ -2139,13 +2141,12 @@
             cu.m_mvd[1][0] = mvzero - mvp1;
             cu.m_mvpIdx[1][0] = (uint8_t)mvpIdx1;
 
-            if (m_bChromaSa8d)
-                /* real MC was already performed */
+            if (m_bChromaSa8d) /* real MC was already performed */
                 bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv);
             else
-                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true);
+                motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_csp != X265_CSP_I400);
         }
-        else if (m_bChromaSa8d)
+        else if (m_bChromaSa8d && (m_csp != X265_CSP_I400))
         {
             /* recover overwritten motion vectors */
             cu.m_mv[0][0] = bestME[0].mv;
@@ -2174,7 +2175,7 @@
     Mode *bestMode = m_modeDepth[cuGeom.depth].bestMode;
     CUData& cu = bestMode->cu;
 
-    cu.copyFromPic(ctu, cuGeom);
+    cu.copyFromPic(ctu, cuGeom, m_csp);
 
     PicYuv& reconPic = *m_frame->m_reconPic;
 
@@ -2191,8 +2192,11 @@
         cu.getIntraTUQtDepthRange(tuDepthRange, 0);
 
         residualTransformQuantIntra(*bestMode, cuGeom, 0, 0, tuDepthRange);
-        getBestIntraModeChroma(*bestMode, cuGeom);
-        residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
+        if (m_csp != X265_CSP_I400)
+        {
+            getBestIntraModeChroma(*bestMode, cuGeom);
+            residualQTIntraChroma(*bestMode, cuGeom, 0, 0);
+        }
     }
     else // if (cu.isInter(0))
     {
@@ -2207,20 +2211,23 @@
         /* at RD 0, the prediction pixels are accumulated into the top depth predYuv */
         Yuv& predYuv = m_modeDepth[0].bestMode->predYuv;
         pixel* predY = predYuv.getLumaAddr(absPartIdx);
-        pixel* predU = predYuv.getCbAddr(absPartIdx);
-        pixel* predV = predYuv.getCrAddr(absPartIdx);
 
         primitives.cu[sizeIdx].sub_ps(resiYuv.m_buf[0], resiYuv.m_size,
                                       fencYuv.m_buf[0], predY,
                                       fencYuv.m_size, predYuv.m_size);
 
-        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
+        if (m_csp != X265_CSP_I400)
+        {
+            pixel* predU = predYuv.getCbAddr(absPartIdx);
+            pixel* predV = predYuv.getCrAddr(absPartIdx);
+            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[1], resiYuv.m_csize,
                                                  fencYuv.m_buf[1], predU,
                                                  fencYuv.m_csize, predYuv.m_csize);
 
-        primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
+            primitives.chroma[m_csp].cu[sizeIdx].sub_ps(resiYuv.m_buf[2], resiYuv.m_csize,
                                                  fencYuv.m_buf[2], predV,
                                                  fencYuv.m_csize, predYuv.m_csize);
+        }
 
         uint32_t tuDepthRange[2];
         cu.getInterTUQtDepthRange(tuDepthRange, 0);
@@ -2239,20 +2246,24 @@
         else
             primitives.cu[sizeIdx].copy_pp(reconPic.getLumaAddr(cu.m_cuAddr, absPartIdx), reconPic.m_stride,
                                            predY, predYuv.m_size);
-
-        if (cu.m_cbf[1][0])
-            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+        if (m_csp != X265_CSP_I400)
+        {
+             pixel* predU = predYuv.getCbAddr(absPartIdx);
+             pixel* predV = predYuv.getCrAddr(absPartIdx);
+            if (cu.m_cbf[1][0])
+                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predU, resiYuv.m_buf[1], predYuv.m_csize, resiYuv.m_csize);
-        else
-            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+            else
+                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCbAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                          predU, predYuv.m_csize);
 
-        if (cu.m_cbf[2][0])
-            primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+            if (cu.m_cbf[2][0])
+                primitives.chroma[m_csp].cu[sizeIdx].add_ps(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                         predV, resiYuv.m_buf[2], predYuv.m_csize, resiYuv.m_csize);
-        else
-            primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
+            else
+                primitives.chroma[m_csp].cu[sizeIdx].copy_pp(reconPic.getCrAddr(cu.m_cuAddr, absPartIdx), reconPic.m_strideC,
                                                          predV, predYuv.m_csize);
+        }
     }
 
     cu.updatePic(cuGeom.depth);
diff -r d01cd1fee4e3 -r d7200ee7910d source/encoder/search.cpp
--- a/source/encoder/search.cpp	Sun Dec 13 19:57:55 2015 +0530
+++ b/source/encoder/search.cpp	Sun Dec 13 20:07:18 2015 +0530
@@ -98,13 +98,27 @@
      * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
      * which are reconstructed at each depth are valid. At the end, the transform depth table
      * is walked and the coeff and recon at the correct depths are collected */
-    for (uint32_t i = 0; i <= m_numLayers; i++)
+
+    if (param.internalCsp != X265_CSP_I400)
     {
-        CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
-        m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
-        m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
-        ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
-        ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL + sizeC * 2);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[0] + sizeL;
+            m_rqt[i].coeffRQT[2] = m_rqt[i].coeffRQT[0] + sizeL + sizeC;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
+    }
+    else
+    {
+        for (uint32_t i = 0; i <= m_numLayers; i++)
+        {
+            CHECKED_MALLOC(m_rqt[i].coeffRQT[0], coeff_t, sizeL);
+            m_rqt[i].coeffRQT[1] = m_rqt[i].coeffRQT[2] = NULL;
+            ok &= m_rqt[i].reconQtYuv.create(g_maxCUSize, param.internalCsp);
+            ok &= m_rqt[i].resiQtYuv.create(g_maxCUSize, param.internalCsp);
+        }
     }
 
     /* the rest of these buffers are indexed per-depth */
@@ -117,12 +131,22 @@
         ok &= m_rqt[i].bidirPredYuv[1].create(cuSize, param.internalCsp);
     }
 
-    CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
-    m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
-    m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
-    CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
-    m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
-    m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    if (param.internalCsp != X265_CSP_I400)
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions * 3);
+        m_qtTempCbf[1] = m_qtTempCbf[0] + numPartitions;
+        m_qtTempCbf[2] = m_qtTempCbf[0] + numPartitions * 2;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions * 3);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
+        m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
+    }
+    else
+    {
+        CHECKED_MALLOC(m_qtTempCbf[0], uint8_t, numPartitions);
+        m_qtTempCbf[1] = m_qtTempCbf[2] = NULL;
+        CHECKED_MALLOC(m_qtTempTransformSkipFlag[0], uint8_t, numPartitions);
+        m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[2] = NULL;
+    }
 
     CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3));
     m_fencScaled = m_intraPred + 32 * 32;
@@ -1166,8 +1190,13 @@
 
     intraMode.initCosts();
     intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
-    intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
-    intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
+    if (m_csp != X265_CSP_I400)
+    {
+        intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
+        intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
+    }
+    else
+        intraMode.distortion += intraMode.lumaDistortion;
 
     m_entropyCoder.resetBits();
     if (m_slice->m_pps->bTransquantBypassEnabled)
@@ -1386,8 +1415,13 @@
     extractIntraResultQT(cu, *reconYuv, 0, 0);
 
     intraMode.lumaDistortion = icosts.distortion;
-    intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
-    intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
+    if (m_csp != X265_CSP_I400)
+    {
+        intraMode.chromaDistortion = estIntraPredChromaQT(intraMode, cuGeom);
+        intraMode.distortion = intraMode.lumaDistortion + intraMode.chromaDistortion;
+    }
+    else
+        intraMode.distortion = intraMode.lumaDistortion;
 
     m_entropyCoder.resetBits();
     if (m_slice->m_pps->bTransquantBypassEnabled)
@@ -2492,11 +2526,14 @@
     // Luma
     int part = partitionFromLog2Size(cu.m_log2CUSize[0]);
     interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
+    interMode.distortion = interMode.lumaDistortion;
     // Chroma
-    interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
-    interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
-    interMode.distortion = interMode.lumaDistortion + interMode.chromaDistortion;
-
+    if (m_csp != X265_CSP_I400)
+    {
+        interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+        interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+        interMode.distortion += interMode.chromaDistortion;
+    }
     m_entropyCoder.load(m_rqt[depth].cur);
     m_entropyCoder.resetBits();
     if (m_slice->m_pps->bTransquantBypassEnabled)
@@ -2546,8 +2583,11 @@
     if (!tqBypass)
     {
         sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
-        cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
-        cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+        if (m_csp != X265_CSP_I400)
+        {
+            cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));
+            cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));
+        }
 
         /* Consider the RD cost of not signaling any residual */
         m_entropyCoder.load(m_rqt[depth].cur);
@@ -2620,15 +2660,19 @@
 
     // update with clipped distortion and cost (qp estimation loop uses unclipped values)
     sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
-    sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
-    bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+    interMode.distortion = bestLumaDist;
+    if (m_csp != X265_CSP_I400)
+    {
+        sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));
+        bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));
+        interMode.chromaDistortion = bestChromaDist;
+        interMode.distortion += bestChromaDist;
+    }
     if (m_rdCost.m_psyRd)
         interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);
     interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);
     interMode.totalBits = bits;
     interMode.lumaDistortion = bestLumaDist;
-    interMode.chromaDistortion = bestChromaDist;
-    interMode.distortion = bestLumaDist + bestChromaDist;
     interMode.coeffBits = coeffBits;
     interMode.mvBits = mvBits;
     updateModeCost(interMode);
@@ -2649,14 +2693,15 @@
     {
         // code full block
         uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-        bool bCodeChroma = true;
+        uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
+
         uint32_t tuDepthC = tuDepth;
         if (log2TrSizeC < 2)
         {
             X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
             log2TrSizeC = 2;
             tuDepthC--;
-            bCodeChroma = !(absPartIdx & 3);
+            codeChroma &= !(absPartIdx & 3);
         }
 
         uint32_t absPartIdxStep = cuGeom.numPartitions >> tuDepthC * 2;
@@ -2690,7 +2735,7 @@
             cu.setCbfSubParts(0, TEXT_LUMA, absPartIdx, depth);
         }
 
-        if (bCodeChroma)
+        if (codeChroma)
         {
             uint32_t sizeIdxC = log2TrSizeC - 2;
             uint32_t strideResiC = resiYuv.m_csize;
@@ -2756,14 +2801,20 @@
         {
             residualTransformQuantInter(mode, cuGeom, qPartIdx, tuDepth + 1, depthRange);
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
-            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
-            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+            if (m_csp != X265_CSP_I400)
+            {
+                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+            }
         }
         for (uint32_t i = 0; i < 4 * qNumParts; ++i)
         {
             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
-            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
-            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
+            if (m_csp != X265_CSP_I400)
+            {
+                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
+                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
+            }
         }
     }
 }
@@ -2794,14 +2845,14 @@
     X265_CHECK(bCheckFull || bCheckSplit, "check-full or check-split must be set\n");
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-    bool bCodeChroma = true;
+    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
     uint32_t tuDepthC = tuDepth;
     if (log2TrSizeC < 2)
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
         log2TrSizeC = 2;
         tuDepthC--;
-        bCodeChroma = !(absPartIdx & 3);
+        codeChroma &= !(absPartIdx & 3);
     }
 
     // code full block
@@ -2827,14 +2878,14 @@
     if (bCheckFull)
     {
         uint32_t trSizeC = 1 << log2TrSizeC;
-        int partSize  = partitionFromLog2Size(log2TrSize);
+        int partSize = partitionFromLog2Size(log2TrSize);
         int partSizeC = partitionFromLog2Size(log2TrSizeC);
         const uint32_t qtLayer = log2TrSize - 2;
         uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
         coeff_t* coeffCurY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
 
-        bool checkTransformSkip   = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
-        bool checkTransformSkipY  = checkTransformSkip && log2TrSize  <= MAX_LOG2_TS_SIZE;
+        bool checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0];
+        bool checkTransformSkipY = checkTransformSkip && log2TrSize <= MAX_LOG2_TS_SIZE;
         bool checkTransformSkipC = checkTransformSkip && log2TrSizeC <= MAX_LOG2_TS_SIZE;
 
         cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
@@ -2844,7 +2895,7 @@
             m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
 
         const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
-        int16_t* resi     = resiYuv.getLumaAddr(absPartIdx);
+        int16_t* resi = resiYuv.getLumaAddr(absPartIdx);
         numSig[TEXT_LUMA][0] = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, coeffCurY, log2TrSize, TEXT_LUMA, absPartIdx, false);
         cbfFlag[TEXT_LUMA][0] = !!numSig[TEXT_LUMA][0];
 
@@ -2865,7 +2916,7 @@
         if (m_rdCost.m_psyRd)
             zeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, mode.predYuv.getLumaAddr(absPartIdx), mode.predYuv.m_size);
 
-        int16_t* curResiY    = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
+        int16_t* curResiY = m_rqt[qtLayer].resiQtYuv.getLumaAddr(absPartIdx);
         uint32_t strideResiY = m_rqt[qtLayer].resiQtYuv.m_size;
 
         if (cbfFlag[TEXT_LUMA][0])
@@ -2874,7 +2925,7 @@
 
             // non-zero cost calculation for luma - This is an approximation
             // finally we have to encode correct cbf after comparing with null cost
-            pixel* curReconY      = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
+            pixel* curReconY = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
             uint32_t strideReconY = m_rqt[qtLayer].reconQtYuv.m_size;
             primitives.cu[partSize].add_ps(curReconY, strideReconY, mode.predYuv.getLumaAddr(absPartIdx), curResiY, mode.predYuv.m_size, strideResiY);
 
@@ -2883,7 +2934,7 @@
             uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = 0;
             if (m_rdCost.m_psyRd)
             {
-                nonZeroPsyEnergyY = m_rdCost.psyCost(partSize,  fenc, fencYuv->m_size, curReconY, strideReconY);
+                nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, fenc, fencYuv->m_size, curReconY, strideReconY);
                 singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, nzCbfBitsY + singleBits[TEXT_LUMA][0], nonZeroPsyEnergyY);
             }
             else
@@ -2908,7 +2959,7 @@
                     primitives.cu[partSize].blockfill_s(curResiY, strideResiY, 0);
 #if CHECKED_BUILD || _DEBUG
                     uint32_t numCoeffY = 1 << (log2TrSize << 1);
-                    memset(coeffCurY, 0, sizeof(coeff_t) * numCoeffY);
+                    memset(coeffCurY, 0, sizeof(coeff_t)* numCoeffY);
 #endif
                     if (checkTransformSkipY)
                         minCost[TEXT_LUMA][0] = nullCostY;
@@ -2936,7 +2987,7 @@
 
         cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
 
-        if (bCodeChroma)
+        if (codeChroma)
         {
             uint32_t coeffOffsetC = coeffOffsetY >> (m_hChromaShift + m_vChromaShift);
             uint32_t strideResiC  = m_rqt[qtLayer].resiQtYuv.m_csize;
@@ -3099,7 +3150,7 @@
             cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
         }
 
-        if (bCodeChroma && checkTransformSkipC)
+        if (codeChroma && checkTransformSkipC)
         {
             sse_t nonZeroDistC = 0;
             uint32_t nonZeroPsyEnergyC = 0;
@@ -3180,7 +3231,7 @@
         m_entropyCoder.resetBits();
 
         //Encode cbf flags
-        if (bCodeChroma)
+        if (codeChroma)
         {
             if (!splitIntoSubTUs)
             {
@@ -3254,14 +3305,20 @@
         {
             estimateResidualQT(mode, cuGeom, qPartIdx, tuDepth + 1, resiYuv, splitCost, depthRange);
             ycbf |= cu.getCbf(qPartIdx, TEXT_LUMA,     tuDepth + 1);
-            ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
-            vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+            if (m_csp != X265_CSP_I400)
+            {
+                ucbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_U, tuDepth + 1);
+                vcbf |= cu.getCbf(qPartIdx, TEXT_CHROMA_V, tuDepth + 1);
+            }
         }
         for (uint32_t i = 0; i < 4 * qNumParts; ++i)
         {
             cu.m_cbf[0][absPartIdx + i] |= ycbf << tuDepth;
-            cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
-            cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
+            if (m_csp != X265_CSP_I400)
+            {
+                cu.m_cbf[1][absPartIdx + i] |= ucbf << tuDepth;
+                cu.m_cbf[2][absPartIdx + i] |= vcbf << tuDepth;
+            }
         }
 
         // Here we were encoding cbfs and coefficients for splitted blocks. Since I have collected coefficient bits
@@ -3295,7 +3352,7 @@
         }
 
         cu.setTransformSkipSubParts(bestTransformMode[TEXT_LUMA][0], TEXT_LUMA, absPartIdx, depth);
-        if (bCodeChroma)
+        if (codeChroma)
         {
             if (!splitIntoSubTUs)
             {
@@ -3318,7 +3375,7 @@
     cu.setTUDepthSubParts(tuDepth, absPartIdx, depth);
     cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth);
 
-    if (bCodeChroma)
+    if (codeChroma)
     {
         if (!splitIntoSubTUs)
         {
@@ -3350,18 +3407,20 @@
 
     const bool bSubdiv  = tuDepth < cu.m_tuDepth[absPartIdx];
     uint32_t log2TrSize = cu.m_log2CUSize[0] - tuDepth;
-
-    if (!(log2TrSize - m_hChromaShift < 2))
+    if (m_csp != X265_CSP_I400)
     {
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
-            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
-        if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
-            m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
-    }
-    else
-    {
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
-        X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+        if (!(log2TrSize - m_hChromaShift < 2))
+        {
+            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1))
+                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !bSubdiv);
+            if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1))
+                m_entropyCoder.codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !bSubdiv);
+        }
+        else
+        {
+            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma CBF not matching\n");
+            X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma CBF not matching\n");
+        }
     }
 
     if (!bSubdiv)
@@ -3391,14 +3450,14 @@
     const uint32_t qtLayer = log2TrSize - 2;
 
     uint32_t log2TrSizeC = log2TrSize - m_hChromaShift;
-    bool bCodeChroma = true;
+    uint32_t codeChroma = (m_csp != X265_CSP_I400) ? 1 : 0;
     uint32_t tuDepthC = tuDepth;
     if (log2TrSizeC < 2)
     {
         X265_CHECK(log2TrSize == 2 && m_csp != X265_CSP_I444 && tuDepth, "invalid tuDepth\n");
         log2TrSizeC = 2;
         tuDepthC--;
-        bCodeChroma = !(absPartIdx & 3);
+        codeChroma &= !(absPartIdx & 3);
     }
 
     m_rqt[qtLayer].resiQtYuv.copyPartToPartLuma(resiYuv, absPartIdx, log2TrSize);
@@ -3409,7 +3468,7 @@
     coeff_t* coeffDstY = cu.m_trCoeff[0] + coeffOffsetY;
     memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
 
-    if (bCodeChroma)
+    if (codeChroma)
     {
         m_rqt[qtLayer].resiQtYuv.copyPartToPartChroma(resiYuv, absPartIdx, log2TrSizeC + m_hChromaShift);