[x265] cleanup m_sharedPredTransformSkip[]

Satoshi Nakagawa nakagawa424 at oki.com
Sat Mar 8 05:21:33 CET 2014


For clarity, default0Save1Load2 is renamed to bReusePred.

# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1394251730 -32400
#      Sat Mar 08 13:08:50 2014 +0900
# Node ID ee48209c7d8bb298adc3a0eebc8f4c664ffe0f4c
# Parent  2bf727dca27d6f69e96d4412850661cbe036cbef
cleanup m_sharedPredTransformSkip[]

The NEW_CALCRECON macro is a TODO marker for asm experts, so that they can optimize register assignment.


diff -r 2bf727dca27d -r ee48209c7d8b source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Sat Mar 08 13:08:50 2014 +0900
@@ -64,7 +64,6 @@
     m_qtTempTUCoeffCr = NULL;
     for (int i = 0; i < 3; i++)
     {
-        m_sharedPredTransformSkip[i] = NULL;
         m_qtTempTransformSkipFlag[i] = NULL;
         m_qtTempCbf[i] = NULL;
     }
@@ -97,7 +96,6 @@
     for (uint32_t i = 0; i < 3; ++i)
     {
         X265_FREE(m_qtTempCbf[i]);
-        X265_FREE(m_sharedPredTransformSkip[i]);
         X265_FREE(m_qtTempTransformSkipFlag[i]);
     }
 
@@ -154,9 +152,6 @@
     CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
     CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
 
-    CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
@@ -403,11 +398,10 @@
                                      uint32_t    absPartIdx,
                                      TComYuv*    fencYuv,
                                      TComYuv*    predYuv,
-                                     ShortYuv*  resiYuv,
+                                     ShortYuv*   resiYuv,
                                      uint32_t&   outDist,
-                                     int         default0Save1Load2)
+                                     bool        bReusePred)
 {
-    uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
     uint32_t fullDepth    = cu->getDepth(0)  + trDepth;
     uint32_t width        = cu->getCUSize(0) >> trDepth;
     uint32_t height       = cu->getCUSize(0) >> trDepth;
@@ -415,7 +409,6 @@
     Pel*     fenc         = fencYuv->getLumaAddr(absPartIdx);
     Pel*     pred         = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
-    Pel*     recon        = predYuv->getLumaAddr(absPartIdx);
     int      chFmt        = cu->getChromaFormat();
     int      part         = partitionFromSizes(width, height);
 
@@ -433,22 +426,13 @@
     uint32_t reconIPredStride = cu->getPic()->getPicYuvRec()->getStride();
     bool     useTransformSkip = cu->getTransformSkip(absPartIdx, TEXT_LUMA);
 
-    //===== init availability pattern =====
-
-    if (default0Save1Load2 != 2)
+    if (!bReusePred)
     {
+        //===== init availability pattern =====
         cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt);
+        uint32_t lumaPredMode = cu->getLumaIntraDir(absPartIdx);
         //===== get prediction signal =====
         predIntraLumaAng(lumaPredMode, pred, stride, width);
-        // save prediction
-        if (default0Save1Load2 == 1)
-        {
-            primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], width, pred, stride);
-        }
-    }
-    else
-    {
-        primitives.luma_copy_pp[part](pred, stride, m_sharedPredTransformSkip[0], width);
     }
 
     //===== get residual signal =====
@@ -492,12 +476,19 @@
         primitives.blockfill_s[size](resiTmp, stride, 0);
     }
 
+    assert(width <= 32);
+#if NEW_CALCRECON
     //===== reconstruction =====
-    assert(width <= 32);
+    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
+    //===== update distortion =====
+    outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
+#else
+    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+    //===== reconstruction =====
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
-
     //===== update distortion =====
     outDist += primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
 }
 
 void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
@@ -505,10 +496,10 @@
                                        uint32_t    absPartIdx,
                                        TComYuv*    fencYuv,
                                        TComYuv*    predYuv,
-                                       ShortYuv*  resiYuv,
+                                       ShortYuv*   resiYuv,
                                        uint32_t&   outDist,
                                        uint32_t    chromaId,
-                                       int         default0Save1Load2)
+                                       bool        bReusePred)
 {
     uint32_t origTrDepth = trDepth;
     uint32_t fullDepth   = cu->getDepth(0) + trDepth;
@@ -528,14 +519,12 @@
     }
 
     TextType ttype          = (chromaId > 0 ? TEXT_CHROMA_V : TEXT_CHROMA_U);
-    uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdx);
     uint32_t width          = cu->getCUSize(0) >> (trDepth + m_hChromaShift);
     uint32_t height         = cu->getCUSize(0) >> (trDepth + m_vChromaShift);
     uint32_t stride         = fencYuv->getCStride();
     Pel*     fenc           = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
     Pel*     pred           = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
     int16_t* residual       = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) : resiYuv->getCbAddr(absPartIdx));
-    Pel*     recon          = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
 
     uint32_t qtlayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
     uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUSize() * cu->getSlice()->getSPS()->getMaxCUSize() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
@@ -548,33 +537,21 @@
     bool     useTransformSkipChroma = cu->getTransformSkip(absPartIdx, ttype);
     int      part = partitionFromSizes(width, height);
 
-    //===== update chroma mode =====
-    if (chromaPredMode == DM_CHROMA_IDX)
+    if (!bReusePred)
     {
-        chromaPredMode = cu->getLumaIntraDir(absPartIdx);
-    }
-
-    //===== init availability pattern =====
-    if (default0Save1Load2 != 2)
-    {
+        //===== init availability pattern =====
         cu->getPattern()->initAdiPatternChroma(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, chromaId);
         Pel* chromaPred = TComPattern::getAdiChromaBuf(chromaId, height, m_predBuf);
 
+        uint32_t chromaPredMode = cu->getChromaIntraDir(absPartIdx);
+        //===== update chroma mode =====
+        if (chromaPredMode == DM_CHROMA_IDX)
+        {
+            chromaPredMode = cu->getLumaIntraDir(absPartIdx);
+        }
+
         //===== get prediction signal =====
         predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, height, chFmt);
-
-        // save prediction
-        if (default0Save1Load2 == 1)
-        {
-            Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
-            primitives.luma_copy_pp[part](predbuf, width, pred, stride);
-        }
-    }
-    else
-    {
-        // load prediction
-        Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
-        primitives.luma_copy_pp[part](pred, stride, predbuf, width);
     }
 
     //===== get residual signal =====
@@ -628,12 +605,20 @@
         }
     }
 
+    assert(((intptr_t)residual & (width - 1)) == 0);
+    assert(width <= 32);
+#if NEW_CALCRECON
     //===== reconstruction =====
-    assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
-    assert(width <= 32);
+    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+    //===== update distortion =====
+    uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
+#else
+    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+    //===== reconstruction =====
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
     //===== update distortion =====
     uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
     if (ttype == TEXT_CHROMA_U)
     {
         outDist += m_rdCost->scaleChromaDistCb(dist);
@@ -720,8 +705,7 @@
             uint32_t singleCbfUTmp      = 0;
             uint32_t singleCbfVTmp      = 0;
             uint64_t singleCostTmp      = 0;
-            int    default0Save1Load2 = 0;
-            int    firstCheckId       = 0;
+            const int firstCheckId      = 0;
 
             uint32_t qpdiv = cu->getPic()->getNumPartInCU() >> ((cu->getDepth(0) + (trDepth - 1)) << 1);
             bool   bFirstQ = ((absPartIdx % qpdiv) == 0);
@@ -731,16 +715,9 @@
                 singleDistYTmp = 0;
                 singleDistCTmp = 0;
                 cu->setTransformSkipSubParts(modeId, TEXT_LUMA, absPartIdx, fullDepth);
-                if (modeId == firstCheckId)
-                {
-                    default0Save1Load2 = 1;
-                }
-                else
-                {
-                    default0Save1Load2 = 2;
-                }
                 //----- code luma block with given intra prediction mode and store Cbf-----
-                xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistYTmp, default0Save1Load2);
+                bool bReusePred = modeId != firstCheckId;
+                xIntraCodingLumaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistYTmp, bReusePred);
                 singleCbfYTmp = cu->getCbf(absPartIdx, TEXT_LUMA, trDepth);
                 //----- code chroma blocks with given intra prediction mode and store Cbf-----
                 if (!bLumaOnly)
@@ -750,8 +727,8 @@
                         cu->setTransformSkipSubParts(modeId, TEXT_CHROMA_U, absPartIdx, fullDepth);
                         cu->setTransformSkipSubParts(modeId, TEXT_CHROMA_V, absPartIdx, fullDepth);
                     }
-                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, 0, default0Save1Load2);
-                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, 1, default0Save1Load2);
+                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, 0, bReusePred);
+                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, 1, bReusePred);
                     singleCbfUTmp = cu->getCbf(absPartIdx, TEXT_CHROMA_U, trDepth);
                     singleCbfVTmp = cu->getCbf(absPartIdx, TEXT_CHROMA_V, trDepth);
                 }
@@ -1437,22 +1414,14 @@
                 uint64_t singleCostTmp  = 0;
                 uint32_t singleCbfCTmp  = 0;
 
-                int     default0Save1Load2 = 0;
-                int     firstCheckId       = 0;
+                const int firstCheckId  = 0;
 
                 for (int chromaModeId = firstCheckId; chromaModeId < 2; chromaModeId++)
                 {
                     cu->setTransformSkipSubParts(chromaModeId, (TextType)(chromaId + TEXT_CHROMA_U), absPartIdx, cu->getDepth(0) + actualTrDepth);
-                    if (chromaModeId == firstCheckId)
-                    {
-                        default0Save1Load2 = 1;
-                    }
-                    else
-                    {
-                        default0Save1Load2 = 2;
-                    }
                     singleDistCTmp = 0;
-                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, chromaId, default0Save1Load2);
+                    bool bReusePred = chromaModeId != firstCheckId;
+                    xIntraCodingChromaBlk(cu, trDepth, absPartIdx, fencYuv, predYuv, resiYuv, singleDistCTmp, chromaId, bReusePred);
                     singleCbfCTmp = cu->getCbf(absPartIdx, (TextType)(chromaId + TEXT_CHROMA_U), trDepth);
 
                     if (chromaModeId == 1 && singleCbfCTmp == 0)
diff -r 2bf727dca27d -r ee48209c7d8b source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Sat Mar 08 13:08:50 2014 +0900
@@ -83,7 +83,6 @@
 protected:
 
     ShortYuv*      m_qtTempShortYuv;
-    pixel*          m_sharedPredTransformSkip[3];
 
     TCoeff**        m_qtTempCoeffY;
     TCoeff**        m_qtTempCoeffCb;
@@ -203,10 +202,10 @@
     uint32_t xGetIntraBitsQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, bool bLuma, bool bChroma);
     uint32_t xGetIntraBitsQTChroma(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, uint32_t uiChromaId);
     void xIntraCodingLumaBlk(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
-                             ShortYuv* resiYuv, uint32_t& outDist, int default0Save1Load2 = 0);
+                             ShortYuv* resiYuv, uint32_t& outDist, bool bReusePred = false);
 
     void xIntraCodingChromaBlk(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, TComYuv* predYuv,
-                               ShortYuv* resiYuv, uint32_t& outDist, uint32_t uiChromaId, int default0Save1Load2 = 0);
+                               ShortYuv* resiYuv, uint32_t& outDist, uint32_t uiChromaId, bool bReusePred = false);
 
     void xRecurIntraChromaCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv,
                                    TComYuv* predYuv, ShortYuv* resiYuv, uint32_t& outDist);
diff -r 2bf727dca27d -r ee48209c7d8b source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/common/pixel.cpp	Sat Mar 08 13:08:50 2014 +0900
@@ -460,20 +460,33 @@
 }
 
 template<int blockSize>
-void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
+void calcRecons(pixel* pred, int16_t* residual,
+#if NEW_CALCRECON
+                pixel*,
+#else
+                pixel* recon,
+#endif
+                int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
 {
     for (int uiY = 0; uiY < blockSize; uiY++)
     {
         for (int uiX = 0; uiX < blockSize; uiX++)
         {
+#if NEW_CALCRECON
+            recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
+            recipred[uiX] = (pixel)recqt[uiX];
+#else
             recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
             recqt[uiX] = (int16_t)recon[uiX];
             recipred[uiX] = recon[uiX];
+#endif
         }
 
         pred += stride;
         residual += stride;
+#if !NEW_CALCRECON
         recon += stride;
+#endif
         recqt += qtstride;
         recipred += ipredstride;
     }
diff -r 2bf727dca27d -r ee48209c7d8b source/common/primitives.h
--- a/source/common/primitives.h	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/common/primitives.h	Sat Mar 08 13:08:50 2014 +0900
@@ -32,6 +32,8 @@
 #include "common.h"
 #include "cpu.h"
 
+#define NEW_CALCRECON 1 // TODO: remove recon[] arg
+
 namespace x265 {
 // x265 private namespace
 
diff -r 2bf727dca27d -r ee48209c7d8b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Sat Mar 08 13:08:50 2014 +0900
@@ -57,6 +57,7 @@
 cextern pw_2000
 cextern pw_pixel_max
 
+%define NEW_CALCRECON 1 ; TODO: remove recon[] arg
 ;-----------------------------------------------------------------------------
 ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
 ;-----------------------------------------------------------------------------
@@ -101,7 +102,9 @@
     CLIPW       m0, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movh        [t2], m0
+%endif
     movh        [t4], m0
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -113,7 +116,9 @@
     movhps      [t4 + t7], m0
     lea         t4, [t4 + t7 * 2]
 %endif
+%if NEW_CALCRECON == 0
     movhps      [t2 + t5], m0
+%endif
 
     ; store recqt[]
     movh        [t3], m0
@@ -123,7 +128,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -165,11 +172,15 @@
     packuswb    m1, m1
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movd        [t2], m1
+%endif
     movd        [t4], m1
     add         t4, t7
     pshufd      m2, m1, 1
+%if NEW_CALCRECON == 0
     movd        [t2 + t5], m2
+%endif
     movd        [t4], m2
     add         t4, t7
 
@@ -182,7 +193,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -231,8 +244,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + t5], m1
+%endif
     movu        [t4], m0
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -253,7 +268,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -295,8 +312,10 @@
     packuswb    m1, m2
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movlps      [t2], m1
     movhps      [t2 + t5], m1
+%endif
     movlps      [t4], m1
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -317,7 +336,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -367,8 +388,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + 16], m1
+%endif
     movu        [t4], m0
     movu        [t4 + 16], m1
 %if ARCH_X86_64 == 0
@@ -391,8 +414,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5], m0
     movu        [t2 + t5 + 16], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4], m0
     movu        [t4 + 16], m1
@@ -411,7 +436,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -451,7 +478,9 @@
     packuswb    m1, m2
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m1
+%endif
     movu        [t4], m1
 
     ; store recqt[]
@@ -464,7 +493,9 @@
     add         t4, t7
     add         t0, t5
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     add         t2, t5
+%endif
 
     dec         t8d
     jnz        .loop
@@ -513,8 +544,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + 16], m1
+%endif
     movu        [t4], m0
     movu        [t4 + 16], m1
 
@@ -532,8 +565,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + 32], m0
     movu        [t2 + 48], m1
+%endif
     movu        [t4 + 32], m0
     movu        [t4 + 48], m1
 %if ARCH_X86_64 == 0
@@ -556,8 +591,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5], m0
     movu        [t2 + t5 + 16], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4], m0
     movu        [t4 + 16], m1
@@ -580,8 +617,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5 + 32], m0
     movu        [t2 + t5 + 48], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4 + 32], m0
     movu        [t4 + 48], m1
@@ -600,7 +639,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -648,8 +689,10 @@
     packuswb    m3, m4
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m1
     movu        [t2 + 16], m3
+%endif
     movu        [t4], m1
     movu        [t4 + 16], m3
 
@@ -667,7 +710,9 @@
     add         t4, t7
     add         t0, t5
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     add         t2, t5
+%endif
 
     dec         t8d
     jnz        .loop
diff -r 2bf727dca27d -r ee48209c7d8b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Fri Mar 07 15:11:13 2014 +0530
+++ b/source/test/pixelharness.cpp	Sat Mar 08 13:08:50 2014 +0900
@@ -347,10 +347,12 @@
         {
             return false;
         }
+#if !NEW_CALCRECON
         if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)))
         {
             return false;
         }
+#endif
         if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
         {
             return false;


More information about the x265-devel mailing list