[x265] [PATCH 2 of 2] refine intra tskip related code.

Satoshi Nakagawa nakagawa424 at oki.com
Tue Jun 24 08:50:51 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1403592156 -32400
#      Tue Jun 24 15:42:36 2014 +0900
# Node ID ed2786407c46be823515c78cf23d7e0f32ee10fc
# Parent  3af58371c5ff95fc838db106610423f2c0ee8265
refine intra tskip related code.

diff -r 3af58371c5ff -r ed2786407c46 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/Lib/TLibCommon/TComYuv.cpp	Tue Jun 24 15:42:36 2014 +0900
@@ -197,21 +197,6 @@
     primitives.luma_copy_ps[part](dst, dststride, getLumaAddr(partIdx), getStride());
 }
 
-void TComYuv::copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
-{
-    X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
-
-    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
-
-    pixel*   src = getChromaAddr(chromaId, partIdx);
-    int16_t* dst = dstPicYuv->getChromaAddr(chromaId, partIdx);
-
-    uint32_t srcstride = getCStride();
-    uint32_t dststride = dstPicYuv->m_cwidth;
-
-    primitives.chroma[m_csp].copy_ps[part](dst, dststride, src, srcstride);
-}
-
 void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize)
 {
     int part = partitionFromSize(partSize);
diff -r 3af58371c5ff -r ed2786407c46 source/Lib/TLibCommon/TComYuv.h
--- a/source/Lib/TLibCommon/TComYuv.h	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/Lib/TLibCommon/TComYuv.h	Tue Jun 24 15:42:36 2014 +0900
@@ -131,7 +131,6 @@
     void    copyPartToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma);
 
     void    copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize);
-    void    copyPartToPartChroma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs);
 
     // ------------------------------------------------------------------------------------------------------------------
     //  Algebraic operation for YUV buffer
diff -r 3af58371c5ff -r ed2786407c46 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Jun 24 15:42:36 2014 +0900
@@ -57,9 +57,6 @@
     m_qtTempCoeff[2] = NULL;
     m_qtTempTrIdx = NULL;
     m_qtTempShortYuv = NULL;
-    m_qtTempTUCoeff[0] = NULL;
-    m_qtTempTUCoeff[1] = NULL;
-    m_qtTempTUCoeff[2] = NULL;
     for (int i = 0; i < 3; i++)
     {
         m_qtTempTransformSkipFlag[i] = NULL;
@@ -83,14 +80,12 @@
         m_qtTempShortYuv[i].destroy();
     }
 
-    X265_FREE(m_qtTempTUCoeff[0]);
     X265_FREE(m_qtTempTrIdx);
     X265_FREE(m_qtTempCbf[0]);
     X265_FREE(m_qtTempTransformSkipFlag[0]);
 
     delete[] m_qtTempCoeff[0];
     delete[] m_qtTempShortYuv;
-    m_qtTempTransformSkipYuv.destroy();
 }
 
 bool TEncSearch::init(Encoder* top, RDCost* rdCost, TComTrQuant* trQuant)
@@ -133,11 +128,7 @@
     m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions;
     m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2;
 
-    CHECKED_MALLOC(m_qtTempTUCoeff[0], coeff_t, MAX_CU_SIZE * MAX_CU_SIZE * 3);
-    m_qtTempTUCoeff[1] = m_qtTempTUCoeff[0] + MAX_CU_SIZE * MAX_CU_SIZE;
-    m_qtTempTUCoeff[2] = m_qtTempTUCoeff[0] + MAX_CU_SIZE * MAX_CU_SIZE * 2;
-
-    return m_qtTempTransformSkipYuv.create(g_maxCUSize, g_maxCUSize, m_param->internalCsp);
+    return true;
 
 fail:
     return false;
@@ -224,7 +215,7 @@
     }
 }
 
-void TEncSearch::xEncCoeffQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype, const bool splitIntoSubTUs)
+void TEncSearch::xEncCoeffQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype)
 {
     if (!cu->getCbf(absPartIdx, ttype, trDepth))
         return;
@@ -238,7 +229,7 @@
         uint32_t qtPartNum = cu->getPic()->getNumPartInCU() >> ((fullDepth + 1) << 1);
         for (uint32_t part = 0; part < 4; part++)
         {
-            xEncCoeffQT(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype, splitIntoSubTUs);
+            xEncCoeffQT(cu, trDepth + 1, absPartIdx + part * qtPartNum, ttype);
         }
 
         return;
@@ -254,8 +245,7 @@
         trDepth--;
         uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + trDepth) << 1);
         bool bFirstQ = ((absPartIdx & (qpdiv - 1)) == 0);
-        bool bSecondQ = (chFmt == CHROMA_422 && splitIntoSubTUs) ? ((absPartIdx & (qpdiv - 1)) == 2) : false;
-        if ((!bFirstQ) && (!bSecondQ))
+        if (!bFirstQ)
         {
             return;
         }
@@ -267,7 +257,6 @@
     int cspy = chroma ? m_vChromaShift : 0;
     uint32_t width = cu->getCUSize(0) >> (trDepth + cspx);
     uint32_t height = cu->getCUSize(0) >> (trDepth + cspy);
-    height = splitIntoSubTUs ? height >> 1 : height;
     uint32_t coeffOffset = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (cspx + cspy));
     uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
     coeff_t* coeff = m_qtTempCoeff[ttype][qtLayer] + coeffOffset;
@@ -370,20 +359,34 @@
 
     if (bLuma)
     {
-        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_LUMA, false);
+        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_LUMA);
     }
     if (bChroma)
     {
-        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_CHROMA_U, false);
-        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_CHROMA_V, false);
+        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_CHROMA_U);
+        xEncCoeffQT(cu, trDepth, absPartIdx, TEXT_CHROMA_V);
     }
     return m_entropyCoder->getNumberOfWrittenBits();
 }
 
-uint32_t TEncSearch::xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs)
+uint32_t TEncSearch::xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t log2TrSize, coeff_t* coeff)
 {
     m_entropyCoder->resetBits();
-    xEncCoeffQT(cu, trDepth, absPartIdx, (TextType)chromaId, splitIntoSubTUs);
+    xEncIntraHeader(cu, trDepth, absPartIdx, true, false);
+    xEncSubdivCbfQT(cu, trDepth, absPartIdx, 0, cu->getCUSize(absPartIdx), cu->getCUSize(absPartIdx), true, false);
+
+    if (cu->getCbf(absPartIdx, TEXT_LUMA, trDepth))
+    {
+        m_entropyCoder->encodeCoeffNxN(cu, coeff, absPartIdx, 1 << log2TrSize, TEXT_LUMA);
+    }
+
+    return m_entropyCoder->getNumberOfWrittenBits();
+}
+
+uint32_t TEncSearch::xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, coeff_t* coeff)
+{
+    m_entropyCoder->resetBits();
+    m_entropyCoder->encodeCoeffNxN(cu, coeff, absPartIdx, 1 << log2TrSizeC, (TextType)chromaId);
     return m_entropyCoder->getNumberOfWrittenBits();
 }
 
@@ -393,6 +396,9 @@
                                      TComYuv*    fencYuv,
                                      TComYuv*    predYuv,
                                      ShortYuv*   resiYuv,
+                                     int16_t*    reconQt,
+                                     uint32_t    reconQtStride,
+                                     coeff_t*    coeff,
                                      uint32_t&   cbf,
                                      uint32_t&   outDist)
 {
@@ -402,12 +408,6 @@
     pixel*   pred         = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
 
-    uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-    uint32_t coeffOffsetY   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
-    coeff_t* coeff          = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
-    int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
-    X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
-    const uint32_t reconQtStride = MAX_CU_SIZE;
     uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
@@ -468,10 +468,12 @@
 
 void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
                                        uint32_t    absPartIdx,
-                                       uint32_t    log2TrSize,
                                        TComYuv*    fencYuv,
                                        TComYuv*    predYuv,
                                        ShortYuv*   resiYuv,
+                                       int16_t*    reconQt,
+                                       uint32_t    reconQtStride,
+                                       coeff_t*    coeff,
                                        uint32_t&   cbf,
                                        uint32_t&   outDist,
                                        uint32_t    chromaId,
@@ -484,11 +486,6 @@
     pixel*   pred         = predYuv->getChromaAddr(chromaId, absPartIdx);
     int16_t* residual     = resiYuv->getChromaAddr(chromaId, absPartIdx);
 
-    uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-    uint32_t coeffOffsetC   = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
-    coeff_t* coeff          = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
-    int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
-    uint32_t reconQtStride  = m_qtTempShortYuv[qtLayer].m_cwidth;
     uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
@@ -641,6 +638,13 @@
 
         cu->setTrIdxSubParts(trDepth, absPartIdx, fullDepth);
 
+        uint32_t qtLayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
+        uint32_t coeffOffsetY   = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
+        coeff_t* coeffY         = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
+        int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
+        X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
+        const uint32_t reconQtStride = MAX_CU_SIZE;
+
         if (checkTransformSkip || checkTQbypass)
         {
             //----- store original entropy coding status -----
@@ -653,8 +657,15 @@
             bool      singleTQbypass = 0;
             const int firstCheckId   = 0;
 
+            ALIGN_VAR_32(coeff_t, tsCoeffY[32 * 32]);
+            ALIGN_VAR_32(int16_t, tsReconY[32 * 32]);
+
             for (int modeId = firstCheckId; modeId < 2; modeId++)
             {
+                coeff_t* coeff = (modeId ? tsCoeffY : coeffY);
+                int16_t* recon = (modeId ? tsReconY : reconQt);
+                uint32_t reconStride = (modeId ? tuSize : reconQtStride);
+
                 singleDistYTmp = 0;
                 singlePsyEnergyYTmp = 0;
                 cu->setTransformSkipSubParts(checkTransformSkip ? modeId : 0, TEXT_LUMA, absPartIdx, fullDepth);
@@ -666,7 +677,7 @@
                 }
 
                 //----- code luma block with given intra prediction mode and store Cbf-----
-                xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfYTmp, singleDistYTmp);
+                xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfYTmp, singleDistYTmp);
                 if (m_rdCost->psyRdEnabled())
                 {
                     uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
@@ -679,11 +690,11 @@
                 if ((modeId == 1) && (singleCbfYTmp == 0) && checkTransformSkip)
                 {
                     // In order not to code TS flag when cbf is zero, the case for TS with cbf being zero is forbidden.
-                    singleCostTmp = MAX_INT64;
+                    break;
                 }
                 else
                 {
-                    uint32_t singleBits = xGetIntraBitsQT(cu, trDepth, absPartIdx, 0, true, false);
+                    uint32_t singleBits = xGetIntraBitsQTLuma(cu, trDepth, absPartIdx, log2TrSize, coeff);
                     if (m_rdCost->psyRdEnabled())
                         singleCostTmp = m_rdCost->calcPsyRdCost(singleDistYTmp, singleBits, singlePsyEnergyYTmp);
                     else
@@ -700,7 +711,6 @@
                     bestModeId   = modeId;
                     if (bestModeId == firstCheckId)
                     {
-                        xStoreIntraResultQT(cu, absPartIdx, log2TrSize);
                         m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
                     }
                 }
@@ -718,10 +728,16 @@
 
             if (bestModeId == firstCheckId)
             {
-                xLoadIntraResultQT(cu, absPartIdx, log2TrSize);
+                xLoadIntraResultQT(cu, absPartIdx, log2TrSize, reconQt, reconQtStride);
                 cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
                 m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
             }
+            else
+            {
+                ::memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2));
+                int sizeIdx = log2TrSize - 2;
+                primitives.square_copy_ss[sizeIdx](reconQt, reconQtStride, tsReconY, tuSize);
+            }
         }
         else
         {
@@ -729,7 +745,7 @@
 
             //----- code luma block with given intra prediction mode and store Cbf-----
             cu->setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
-            xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfY, singleDistY);
+            xIntraCodingLumaBlk(cu, absPartIdx, log2TrSize, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffY, singleCbfY, singleDistY);
             if (m_rdCost->psyRdEnabled())
             {
                 uint32_t zorder = cu->getZorderIdxInCU() + absPartIdx;
@@ -738,7 +754,7 @@
             }
             cu->setCbfSubParts(singleCbfY << trDepth, TEXT_LUMA, absPartIdx, fullDepth);
 
-            uint32_t singleBits = xGetIntraBitsQT(cu, trDepth, absPartIdx, 0, true, false);
+            uint32_t singleBits = xGetIntraBitsQTLuma(cu, trDepth, absPartIdx, log2TrSize, coeffY);
             if (m_param->rdPenalty && (log2TrSize == 5) && !isIntraSlice)
                 singleBits *= 4;
 
@@ -960,11 +976,10 @@
         uint32_t qtLayer    = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
 
         //===== copy transform coefficients =====
-        uint32_t numCoeffY    = 1 << (log2TrSize * 2);
         uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
         coeff_t* coeffSrcY    = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
         coeff_t* coeffDestY   = cu->getCoeffY()           + coeffOffsetY;
-        ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
+        ::memcpy(coeffDestY, coeffSrcY, sizeof(coeff_t) << (log2TrSize * 2));
 
         //===== copy reconstruction =====
         m_qtTempShortYuv[qtLayer].copyPartToPartLuma(reconYuv, absPartIdx, 1 << log2TrSize);
@@ -979,100 +994,28 @@
     }
 }
 
-void TEncSearch::xStoreIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize)
+void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize,
+                                    int16_t* reconQt, uint32_t reconQtStride)
 {
-    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-
-    //===== copy transform coefficients =====
-    uint32_t numCoeffY = 1 << (log2TrSize * 2);
-    uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
-    coeff_t* coeffSrcY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
-    coeff_t* coeffDstY = m_qtTempTUCoeff[0];
-    ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
-
     //===== copy reconstruction =====
-    pixel*   reconTs       = m_qtTempTransformSkipYuv.getLumaAddr(absPartIdx);
-    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getStride();
-    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
-    X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
-    const uint32_t reconQtStride = MAX_CU_SIZE;
     int sizeIdx = log2TrSize - 2;
-    primitives.square_copy_sp[sizeIdx](reconTs, reconTsStride, reconQt, reconQtStride);
-}
-
-void TEncSearch::xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize)
-{
-    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-
-    //===== copy transform coefficients =====
-    uint32_t numCoeffY = 1 << (log2TrSize * 2);
-    uint32_t coeffOffsetY = absPartIdx << cu->getPic()->getLog2UnitSize() * 2;
-    coeff_t* coeffDstY = m_qtTempCoeff[0][qtLayer] + coeffOffsetY;
-    coeff_t* coeffSrcY = m_qtTempTUCoeff[0];
-    ::memcpy(coeffDstY, coeffSrcY, sizeof(coeff_t) * numCoeffY);
-
-    //===== copy reconstruction =====
-    pixel*   reconTs       = m_qtTempTransformSkipYuv.getLumaAddr(absPartIdx);
-    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getStride();
-    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getLumaAddr(absPartIdx);
-    X265_CHECK(m_qtTempShortYuv[qtLayer].m_width == MAX_CU_SIZE, "width is not max CU size\n");
-    const uint32_t reconQtStride = MAX_CU_SIZE;
-    int sizeIdx = log2TrSize - 2;
-    primitives.square_copy_ps[sizeIdx](reconQt, reconQtStride, reconTs, reconTsStride);
-
     uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getLumaAddr(cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
-    primitives.square_copy_pp[sizeIdx](reconIPred, reconIPredStride, reconTs, reconTsStride);
+    primitives.square_copy_sp[sizeIdx](reconIPred, reconIPredStride, reconQt, reconQtStride);
 }
 
-void TEncSearch::xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId)
+void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId,
+                                          int16_t* reconQt, uint32_t reconQtStride)
 {
     X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
 
-    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-
-    //===== copy transform coefficients =====
-    uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
-    uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
-    coeff_t* coeffSrcC = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
-    coeff_t* coeffDstC = m_qtTempTUCoeff[chromaId];
-    ::memcpy(coeffDstC, coeffSrcC, sizeof(coeff_t) * numCoeffC);
-
     //===== copy reconstruction =====
-    pixel*   reconTs       = m_qtTempTransformSkipYuv.getChromaAddr(chromaId, absPartIdx);
-    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getCStride();
-    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
-    uint32_t reconQtStride = m_qtTempShortYuv[qtLayer].m_cwidth;
     int sizeIdxC = log2TrSizeC - 2;
-    primitives.square_copy_sp[sizeIdxC](reconTs, reconTsStride, reconQt, reconQtStride);
-}
-
-void TEncSearch::xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId)
-{
-    X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
-
-    uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
-
-    //===== copy transform coefficients =====
-    uint32_t numCoeffC = 1 << (log2TrSizeC * 2);
-    uint32_t coeffOffsetC = absPartIdx << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
-    coeff_t* coeffDstC = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
-    coeff_t* coeffSrcC = m_qtTempTUCoeff[chromaId];
-    ::memcpy(coeffDstC, coeffSrcC, sizeof(coeff_t) * numCoeffC);
-
-    //===== copy reconstruction =====
-    pixel*   reconTs       = m_qtTempTransformSkipYuv.getChromaAddr(chromaId, absPartIdx);
-    uint32_t reconTsStride = m_qtTempTransformSkipYuv.getCStride();
-    int16_t* reconQt       = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdx);
-    uint32_t reconQtStride = m_qtTempShortYuv[qtLayer].m_cwidth;
-    int sizeIdxC = log2TrSizeC - 2;
-    primitives.square_copy_ps[sizeIdxC](reconQt, reconQtStride, reconTs, reconTsStride);
-
     uint32_t zorder           = cu->getZorderIdxInCU() + absPartIdx;
     pixel*   reconIPred       = cu->getPic()->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), zorder);
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getCStride();
-    primitives.square_copy_pp[sizeIdxC](reconIPred, reconIPredStride, reconTs, reconTsStride);
+    primitives.square_copy_sp[sizeIdxC](reconIPred, reconIPredStride, reconQt, reconQtStride);
 }
 
 void TEncSearch::offsetSubTUCBFs(TComDataCU* cu, TextType ttype, uint32_t trDepth, uint32_t absPartIdx)
@@ -1150,6 +1093,8 @@
                                    log2TrSizeC <= LOG2_MAX_TS_SIZE &&
                                    !cu->getCUTransquantBypass(0));
 
+        uint32_t qtLayer = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - log2TrSize;
+
         if (m_param->bEnableTSkipFast)
         {
             checkTransformSkip &= ((cu->getCUSize(0) >> trDepth) <= 4);
@@ -1194,6 +1139,11 @@
                 uint32_t singleCbfC     = 0;
                 uint32_t singlePsyEnergyTmp = 0;
 
+                int16_t* reconQt        = m_qtTempShortYuv[qtLayer].getChromaAddr(chromaId, absPartIdxC);
+                uint32_t reconQtStride  = m_qtTempShortYuv[qtLayer].m_cwidth;
+                uint32_t coeffOffsetC   = absPartIdxC << (cu->getPic()->getLog2UnitSize() * 2 - (m_hChromaShift + m_vChromaShift));
+                coeff_t* coeffC         = m_qtTempCoeff[chromaId][qtLayer] + coeffOffsetC;
+
                 if (checkTransformSkip)
                 {
                     // use RDO to decide whether Cr/Cb takes TS
@@ -1208,22 +1158,29 @@
 
                     const int firstCheckId  = 0;
 
+                    ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]);
+                    ALIGN_VAR_32(int16_t, tsReconC[MAX_TS_SIZE * MAX_TS_SIZE]);
+
                     for (int chromaModeId = firstCheckId; chromaModeId < 2; chromaModeId++)
                     {
+                        coeff_t* coeff = (chromaModeId ? tsCoeffC : coeffC);
+                        int16_t* recon = (chromaModeId ? tsReconC : reconQt);
+                        uint32_t reconStride = (chromaModeId ? tuSize : reconQtStride);
+
                         cu->setTransformSkipPartRange(chromaModeId, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
 
                         singleDistCTmp = 0;
-                        xIntraCodingChromaBlk(cu, absPartIdxC, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfCTmp, singleDistCTmp, chromaId, log2TrSizeC);
+                        xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, recon, reconStride, coeff, singleCbfCTmp, singleDistCTmp, chromaId, log2TrSizeC);
                         cu->setCbfPartRange(singleCbfCTmp << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
 
                         if (chromaModeId == 1 && singleCbfCTmp == 0)
                         {
                             //In order not to code TS flag when cbf is zero, the case for TS with cbf being zero is forbidden.
-                            singleCostTmp = MAX_INT64;
+                            break;
                         }
                         else
                         {
-                            uint32_t bitsTmp = singleCbfCTmp ? xGetIntraBitsQTChroma(cu, trDepth, absPartIdxC, chromaId, splitIntoSubTUs) : 0;
+                            uint32_t bitsTmp = singleCbfCTmp ? xGetIntraBitsQTChroma(cu, absPartIdxC, log2TrSizeC, chromaId, coeff) : 0;
                             if (m_rdCost->psyRdEnabled())
                             {
                                 uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
@@ -1244,7 +1201,6 @@
                             singlePsyEnergy = singlePsyEnergyTmp;
                             if (bestModeId == firstCheckId)
                             {
-                                xStoreIntraResultChromaQT(cu, absPartIdxC, log2TrSize, log2TrSizeC, chromaId);
                                 m_rdGoOnSbacCoder->store(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
                             }
                         }
@@ -1256,11 +1212,16 @@
 
                     if (bestModeId == firstCheckId)
                     {
-                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSize, log2TrSizeC, chromaId);
+                        xLoadIntraResultChromaQT(cu, absPartIdxC, log2TrSizeC, chromaId, reconQt, reconQtStride);
                         cu->setCbfPartRange(singleCbfC << trDepth, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
-
                         m_rdGoOnSbacCoder->load(m_rdSbacCoders[fullDepth][CI_TEMP_BEST]);
                     }
+                    else
+                    {
+                        ::memcpy(coeffC, tsCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2));
+                        int sizeIdxC = log2TrSizeC - 2;
+                        primitives.square_copy_ss[sizeIdxC](reconQt, reconQtStride, tsReconC, tuSize);
+                    }
 
                     cu->setTransformSkipPartRange(bestModeId, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
 
@@ -1274,7 +1235,7 @@
                 else
                 {
                     cu->setTransformSkipPartRange(0, (TextType)chromaId, absPartIdxC, tuIterator.m_absPartIdxStep);
-                    xIntraCodingChromaBlk(cu, absPartIdxC, log2TrSize, fencYuv, predYuv, resiYuv, singleCbfC, outDist, chromaId, log2TrSizeC);
+                    xIntraCodingChromaBlk(cu, absPartIdxC, fencYuv, predYuv, resiYuv, reconQt, reconQtStride, coeffC, singleCbfC, outDist, chromaId, log2TrSizeC);
                     if (m_rdCost->psyRdEnabled())
                     {
                         uint32_t zorder = cu->getZorderIdxInCU() + absPartIdxC;
diff -r 3af58371c5ff -r ed2786407c46 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Tue Jun 24 15:42:36 2014 +0900
@@ -116,9 +116,7 @@
     uint8_t*        m_qtTempTrIdx;
     uint8_t*        m_qtTempCbf[3];
 
-    coeff_t*        m_qtTempTUCoeff[3];
     uint8_t*        m_qtTempTransformSkipFlag[3];
-    TComYuv         m_qtTempTransformSkipYuv;
 
 public:
     // interface to classes
@@ -202,15 +200,18 @@
 
     void xEncSubdivCbfQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx,  uint32_t absPartIdxStep, uint32_t width, uint32_t height, bool bLuma, bool bChroma);
 
-    void xEncCoeffQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype, const bool splitIntoSubTUs);
+    void xEncCoeffQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TextType ttype);
     void xEncIntraHeader(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, bool bLuma, bool bChroma);
     uint32_t xGetIntraBitsQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t absPartIdxStep, bool bLuma, bool bChroma);
-    uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t chromaId, const bool splitIntoSubTUs);
-    void xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
-                             ShortYuv* resiYuv, uint32_t& cbf, uint32_t& outDist);
+    uint32_t xGetIntraBitsQTLuma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t log2TrSize, coeff_t* coeff);
+    uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId, coeff_t* coeff);
+    void xIntraCodingLumaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+                             int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff,
+                             uint32_t& cbf, uint32_t& outDist);
 
-    void xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, TComYuv* fencYuv, TComYuv* predYuv,
-                               ShortYuv* resiYuv, uint32_t& cbf, uint32_t& outDist, uint32_t chromaId, uint32_t log2TrSizeC);
+    void xIntraCodingChromaBlk(TComDataCU* cu, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv,
+                               int16_t* reconQt, uint32_t reconQtStride, coeff_t* coeff,
+                               uint32_t& cbf, uint32_t& outDist, uint32_t chromaId, uint32_t log2TrSizeC);
 
     void xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
                                    TComYuv* predYuv, ShortYuv* resiYuv, uint32_t& outDist);
@@ -222,10 +223,10 @@
 
     void xSetIntraResultChromaQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* reconYuv);
 
-    void xStoreIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize);
-    void xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize);
-    void xStoreIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId);
-    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize, uint32_t log2TrSizeC, uint32_t chromaId);
+    void xLoadIntraResultQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSize,
+                            int16_t* reconQt, uint32_t reconQtStride);
+    void xLoadIntraResultChromaQT(TComDataCU* cu, uint32_t absPartIdx, uint32_t log2TrSizeC, uint32_t chromaId,
+                                  int16_t* reconQt, uint32_t reconQtStride);
 
     // --------------------------------------------------------------------------------------------
     // Inter search (AMP)
diff -r 3af58371c5ff -r ed2786407c46 source/common/shortyuv.cpp
--- a/source/common/shortyuv.cpp	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/common/shortyuv.cpp	Tue Jun 24 15:42:36 2014 +0900
@@ -212,16 +212,3 @@
     uint32_t dstStride = dstPicYuv->m_cwidth;
     primitives.chroma[m_csp].copy_ss[part](dst, dstStride, src, srcStride);
 }
-
-void ShortYuv::copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs)
-{
-    X265_CHECK(chromaId == 1 || chromaId == 2, "invalid chroma id");
-
-    int part = splitIntoSubTUs ? NUM_CHROMA_PARTITIONS422 : partitionFromSize(lumaSize);
-
-    int16_t* src = getChromaAddr(chromaId, partIdx);
-    pixel* dst = dstPicYuv->getChromaAddr(chromaId, partIdx);
-    uint32_t srcStride = m_cwidth;
-    uint32_t dstStride = dstPicYuv->getCStride();
-    primitives.chroma[m_csp].copy_sp[part](dst, dstStride, src, srcStride);
-}
diff -r 3af58371c5ff -r ed2786407c46 source/common/shortyuv.h
--- a/source/common/shortyuv.h	Tue Jun 24 15:41:55 2014 +0900
+++ b/source/common/shortyuv.h	Tue Jun 24 15:42:36 2014 +0900
@@ -100,7 +100,6 @@
     void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t partSize);
     void copyPartToPartLuma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height);
     void copyPartToPartChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, bool bChromaSame);
-    void copyPartToPartYuvChroma(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize, uint32_t chromaId, const bool splitIntoSubTUs);
 
     // -------------------------------------------------------------------------------------------------------------------
     // member functions to support multiple color space formats


More information about the x265-devel mailing list