[x265] cleanup m_sharedPredTransformSkip[]

Satoshi Nakagawa nakagawa424 at oki.com
Tue Mar 4 11:40:23 CET 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1393929339 -32400
#      Tue Mar 04 19:35:39 2014 +0900
# Node ID 7a61566806f691ddff84cbbc42801f6c2d46df88
# Parent  3cbde0b893e34e5770cc311d3f4b6fe064c27774
cleanup m_sharedPredTransformSkip[]

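The NEW_CALCRECON macro is a TODO marker for asm experts: the recon[] argument of calcrecon is no longer written and should eventually be removed, which would allow the register assignment to be optimized.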


diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Tue Mar 04 19:35:39 2014 +0900
@@ -63,7 +63,6 @@
     m_qtTempTUCoeffCr = NULL;
     for (int i = 0; i < 3; i++)
     {
-        m_sharedPredTransformSkip[i] = NULL;
         m_qtTempTransformSkipFlag[i] = NULL;
         m_qtTempCbf[i] = NULL;
     }
@@ -96,7 +95,6 @@
     for (uint32_t i = 0; i < 3; ++i)
     {
         X265_FREE(m_qtTempCbf[i]);
-        X265_FREE(m_sharedPredTransformSkip[i]);
         X265_FREE(m_qtTempTransformSkipFlag[i]);
     }
 
@@ -153,9 +151,6 @@
     CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions);
     CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions);
 
-    CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
-    CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
     CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT);
@@ -414,7 +409,6 @@
     Pel*     fenc         = fencYuv->getLumaAddr(absPartIdx);
     Pel*     pred         = predYuv->getLumaAddr(absPartIdx);
     int16_t* residual     = resiYuv->getLumaAddr(absPartIdx);
-    Pel*     recon        = predYuv->getLumaAddr(absPartIdx);
     int      chFmt        = cu->getChromaFormat();
     int      part         = partitionFromSizes(width, height);
 
@@ -439,15 +433,6 @@
         cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, m_refLeftFlt);
         //===== get prediction signal =====
         predIntraLumaAng(lumaPredMode, pred, stride, width);
-        // save prediction
-        if (default0Save1Load2 == 1)
-        {
-            primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], width, pred, stride);
-        }
-    }
-    else
-    {
-        primitives.luma_copy_pp[part](pred, stride, m_sharedPredTransformSkip[0], width);
     }
 
     //===== get residual signal =====
@@ -491,12 +476,19 @@
         primitives.blockfill_s[size](resiTmp, stride, 0);
     }
 
+    assert(width <= 32);
+#if NEW_CALCRECON
     //===== reconstruction =====
-    assert(width <= 32);
+    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
+    //===== update distortion =====
+    outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
+#else
+    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+    //===== reconstruction =====
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
-
     //===== update distortion =====
     outDist += primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
 }
 
 void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu,
@@ -534,7 +526,6 @@
     Pel*     fenc           = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) : fencYuv->getCbAddr(absPartIdx));
     Pel*     pred           = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
     int16_t* residual       = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) : resiYuv->getCbAddr(absPartIdx));
-    Pel*     recon          = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) : predYuv->getCbAddr(absPartIdx));
 
     uint32_t qtlayer        = cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2;
     uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth() * cu->getSlice()->getSPS()->getMaxCUHeight() >> (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + m_vChromaShift);
@@ -561,19 +552,6 @@
 
         //===== get prediction signal =====
         predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, height, chFmt);
-
-        // save prediction
-        if (default0Save1Load2 == 1)
-        {
-            Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
-            primitives.luma_copy_pp[part](predbuf, width, pred, stride);
-        }
-    }
-    else
-    {
-        // load prediction
-        Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId];
-        primitives.luma_copy_pp[part](pred, stride, predbuf, width);
     }
 
     //===== get residual signal =====
@@ -627,12 +605,20 @@
         }
     }
 
+    assert(((intptr_t)residual & (width - 1)) == 0);
+    assert(width <= 32);
+#if NEW_CALCRECON
     //===== reconstruction =====
-    assert(((uint32_t)(size_t)residual & (width - 1)) == 0);
-    assert(width <= 32);
+    primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+    //===== update distortion =====
+    uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
+#else
+    ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]);
+    //===== reconstruction =====
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
     //===== update distortion =====
     uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride);
+#endif
     if (ttype == TEXT_CHROMA_U)
     {
         outDist += m_rdCost->scaleChromaDistCb(dist);
diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.h
--- a/source/Lib/TLibEncoder/TEncSearch.h	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.h	Tue Mar 04 19:35:39 2014 +0900
@@ -84,7 +84,6 @@
 protected:
 
     ShortYuv*      m_qtTempShortYuv;
-    pixel*          m_sharedPredTransformSkip[3];
 
     TCoeff**        m_qtTempCoeffY;
     TCoeff**        m_qtTempCoeffCb;
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/pixel.cpp	Tue Mar 04 19:35:39 2014 +0900
@@ -460,20 +460,33 @@
 }
 
 template<int blockSize>
-void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
+void calcRecons(pixel* pred, int16_t* residual,
+#if NEW_CALCRECON
+                pixel*,
+#else
+                pixel* recon,
+#endif
+                int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
 {
     for (int uiY = 0; uiY < blockSize; uiY++)
     {
         for (int uiX = 0; uiX < blockSize; uiX++)
         {
+#if NEW_CALCRECON
+            recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
+            recipred[uiX] = (pixel)recqt[uiX];
+#else
             recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + residual[uiX]);
             recqt[uiX] = (int16_t)recon[uiX];
             recipred[uiX] = recon[uiX];
+#endif
         }
 
         pred += stride;
         residual += stride;
+#if !NEW_CALCRECON
         recon += stride;
+#endif
         recqt += qtstride;
         recipred += ipredstride;
     }
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/primitives.h
--- a/source/common/primitives.h	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/primitives.h	Tue Mar 04 19:35:39 2014 +0900
@@ -34,6 +34,8 @@
 #include "cpu.h"
 #include "x265.h"
 
+#define NEW_CALCRECON 1 // TODO: remove recon[] arg
+
 #define FENC_STRIDE 64
 
 #define NUM_INTRA_MODE 35
diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/common/x86/pixel-util8.asm	Tue Mar 04 19:35:39 2014 +0900
@@ -57,6 +57,7 @@
 cextern pw_2000
 cextern pw_pixel_max
 
+%define NEW_CALCRECON 1 ; TODO: remove recon[] arg
 ;-----------------------------------------------------------------------------
 ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
 ;-----------------------------------------------------------------------------
@@ -101,7 +102,9 @@
     CLIPW       m0, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movh        [t2], m0
+%endif
     movh        [t4], m0
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -113,7 +116,9 @@
     movhps      [t4 + t7], m0
     lea         t4, [t4 + t7 * 2]
 %endif
+%if NEW_CALCRECON == 0
     movhps      [t2 + t5], m0
+%endif
 
     ; store recqt[]
     movh        [t3], m0
@@ -123,7 +128,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -165,11 +172,15 @@
     packuswb    m1, m1
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movd        [t2], m1
+%endif
     movd        [t4], m1
     add         t4, t7
     pshufd      m2, m1, 1
+%if NEW_CALCRECON == 0
     movd        [t2 + t5], m2
+%endif
     movd        [t4], m2
     add         t4, t7
 
@@ -182,7 +193,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -231,8 +244,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + t5], m1
+%endif
     movu        [t4], m0
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -253,7 +268,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -295,8 +312,10 @@
     packuswb    m1, m2
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movlps      [t2], m1
     movhps      [t2 + t5], m1
+%endif
     movlps      [t4], m1
 %if ARCH_X86_64 == 0
     add         t4, t7
@@ -317,7 +336,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 4]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -367,8 +388,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + 16], m1
+%endif
     movu        [t4], m0
     movu        [t4 + 16], m1
 %if ARCH_X86_64 == 0
@@ -391,8 +414,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5], m0
     movu        [t2 + t5 + 16], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4], m0
     movu        [t4 + 16], m1
@@ -411,7 +436,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -451,7 +478,9 @@
     packuswb    m1, m2
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m1
+%endif
     movu        [t4], m1
 
     ; store recqt[]
@@ -464,7 +493,9 @@
     add         t4, t7
     add         t0, t5
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     add         t2, t5
+%endif
 
     dec         t8d
     jnz        .loop
@@ -513,8 +544,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m0
     movu        [t2 + 16], m1
+%endif
     movu        [t4], m0
     movu        [t4 + 16], m1
 
@@ -532,8 +565,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + 32], m0
     movu        [t2 + 48], m1
+%endif
     movu        [t4 + 32], m0
     movu        [t4 + 48], m1
 %if ARCH_X86_64 == 0
@@ -556,8 +591,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5], m0
     movu        [t2 + t5 + 16], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4], m0
     movu        [t4 + 16], m1
@@ -580,8 +617,10 @@
     CLIPW       m1, m4, m5
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2 + t5 + 32], m0
     movu        [t2 + t5 + 48], m1
+%endif
 %if ARCH_X86_64 == 0
     movu        [t4 + 32], m0
     movu        [t4 + 48], m1
@@ -600,7 +639,9 @@
 
     lea         t0, [t0 + t5 * 2]
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     lea         t2, [t2 + t5 * 2]
+%endif
 
     dec         t8d
     jnz        .loop
@@ -648,8 +689,10 @@
     packuswb    m3, m4
 
     ; store recon[] and recipred[]
+%if NEW_CALCRECON == 0
     movu        [t2], m1
     movu        [t2 + 16], m3
+%endif
     movu        [t4], m1
     movu        [t4 + 16], m3
 
@@ -667,7 +710,9 @@
     add         t4, t7
     add         t0, t5
     lea         t1, [t1 + t5 * 2]
+%if NEW_CALCRECON == 0
     add         t2, t5
+%endif
 
     dec         t8d
     jnz        .loop
diff -r 3cbde0b893e3 -r 7a61566806f6 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Mar 03 13:37:35 2014 -0600
+++ b/source/test/pixelharness.cpp	Tue Mar 04 19:35:39 2014 +0900
@@ -351,10 +351,12 @@
         {
             return false;
         }
+#if !NEW_CALCRECON
         if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel)))
         {
             return false;
         }
+#endif
         if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel)))
         {
             return false;


More information about the x265-devel mailing list