[x265] [PATCH 1 of 2] intra: do not use C references for intra all-angs

Steve Borho steve at borho.org
Thu Jan 8 05:59:24 CET 2015


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1420689537 -19800
#      Thu Jan 08 09:28:57 2015 +0530
# Node ID ad1beedf5dd689e1d59d5d41e8e9b53d87d4c75b
# Parent  c4df42f39c2e796ce19b8c3b73969af0230609a1
intra: do not use C references for intra all-angs

For platforms which do not have SIMD-optimized all-angs functions, it is much
better to call the single-angle prediction functions as needed.  The C reference
for this primitive tries to match the SIMD outputs, and does many redundant
transposes.

diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/predict.cpp
--- a/source/common/predict.cpp	Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/predict.cpp	Thu Jan 08 09:28:57 2015 +0530
@@ -86,16 +86,16 @@
     pixel* refLft;
     pixel* refAbv;
 
-    if (!(g_intraFilterFlags[dirMode] & tuSize))
+    if (g_intraFilterFlags[dirMode] & tuSize)
+    {
+        refLft = m_refLeftFlt + tuSize - 1;
+        refAbv = m_refAboveFlt + tuSize - 1;
+    }
+    else
     {
         refLft = m_refLeft + tuSize - 1;
         refAbv = m_refAbove + tuSize - 1;
     }
-    else
-    {
-        refLft = m_refLeftFlt + tuSize - 1;
-        refAbv = m_refAboveFlt + tuSize - 1;
-    }
 
     bool bFilter = log2TrSize <= 4;
     int sizeIdx = log2TrSize - 2;
diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/primitives.cpp	Thu Jan 08 09:28:57 2015 +0530
@@ -162,6 +162,12 @@
     {
         Setup_C_Primitives(primitives);
 
+        /* We do not want the encoder to use the un-optimized intra all-angles
+         * C references. It is better to call the individual angle functions
+         * instead. We must check for NULL before using this primitive */
+        for (int i = 0; i < NUM_TR_SIZE; i++)
+            primitives.intra_pred_allangs[i] = NULL;
+
 #if ENABLE_ASSEMBLY
         Setup_Instrinsic_Primitives(primitives, cpuid);
         Setup_Assembly_Primitives(primitives, cpuid);
diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jan 08 09:28:57 2015 +0530
@@ -1246,32 +1246,6 @@
 namespace x265 {
 // private x265 namespace
 
-#if HIGH_BIT_DEPTH
-/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */
-template<int log2Size>
-void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
-{
-    const int size = 1 << log2Size;
-    const int sizeIdx = log2Size - 2;
-    ALIGN_VAR_32(pixel, buffer[32 * 32]);
-
-    for (int mode = 2; mode <= 34; mode++)
-    {
-        pixel *left  = (g_intraFilterFlags[mode] & size ? left1  : left0);
-        pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
-        pixel *out = dest + ((mode - 2) << (log2Size * 2));
-
-        if (mode < 18)
-        {
-            primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma);
-            primitives.transpose[sizeIdx](out, buffer, size);
-        }
-        else
-            primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma);
-    }
-}
-#endif
-
 void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
 {
 #if HIGH_BIT_DEPTH
@@ -1521,14 +1495,6 @@
         p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
     }
 
-    if (p.intra_pred[0][0] && p.transpose[0])
-    {
-        p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
-        p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
-        p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
-        p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
-    }
-
 #else // if HIGH_BIT_DEPTH
     if (cpuMask & X265_CPU_SSE2)
     {
diff -r c4df42f39c2e -r ad1beedf5dd6 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Wed Jan 07 18:01:59 2015 +0530
+++ b/source/encoder/search.cpp	Thu Jan 08 09:28:57 2015 +0530
@@ -1290,22 +1290,36 @@
     cost = m_rdCost.calcRdSADCost(sad, bits);
     COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
 
-    // Transpose NxN
-    primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
-    primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+    bool allangs = true;
+    if (primitives.intra_pred_allangs[sizeIdx])
+    {
+        primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+        primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+    }
+    else
+        allangs = false;
 
     bool modeHor;
     const pixel* cmp;
     intptr_t srcStride;
 
 #define TRY_ANGLE(angle) \
-    modeHor = angle < 18; \
-    cmp = modeHor ? bufTrans : fenc; \
-    srcStride = modeHor ? scaleTuSize : scaleStride; \
-    sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
-    bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
-    cost = m_rdCost.calcRdSADCost(sad, bits)
+    if (allangs) { /* all-angs output is in tmp; modes < 18 are compared against the transposed source */ \
+        modeHor = angle < 18; \
+        cmp = modeHor ? bufTrans : fenc; \
+        srcStride = modeHor ? scaleTuSize : scaleStride; \
+        sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+        cost = m_rdCost.calcRdSADCost(sad, bits); \
+    } else { /* no all-angs primitive: predict only the requested angle into tmp */ \
+        if (g_intraFilterFlags[angle] & scaleTuSize) \
+            primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, angle, scaleTuSize <= 16); \
+        else \
+            primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, left, above, angle, scaleTuSize <= 16); \
+        sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; /* cmp/srcStride are only set in the allangs branch */ \
+        bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+        cost = m_rdCost.calcRdSADCost(sad, bits); \
+    }
 
     if (m_param->bEnableFastIntra)
     {
@@ -1520,18 +1534,34 @@
             COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
             // angular predictions
-            primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
-            primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
-            for (int mode = 2; mode < 35; mode++)
+            if (primitives.intra_pred_allangs[sizeIdx])
             {
-                bool modeHor = (mode < 18);
-                const pixel* cmp = (modeHor ? buf_trans : fenc);
-                intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
-                bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
-                sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
-                modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
-                COPY1_IF_LT(bcost, modeCosts[mode]);
+                primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+                primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
+                for (int mode = 2; mode < 35; mode++)
+                {
+                    bool modeHor = (mode < 18);
+                    const pixel* cmp = (modeHor ? buf_trans : fenc);
+                    intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
+                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+                    sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+                    modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+                    COPY1_IF_LT(bcost, modeCosts[mode]);
+                }
+            }
+            else
+            {
+                for (int mode = 2; mode < 35; mode++)
+                {
+                    if (g_intraFilterFlags[mode] & scaleTuSize)
+                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, mode, scaleTuSize <= 16);
+                    else
+                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, left, above, mode, scaleTuSize <= 16);
+                    bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+                    sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift;
+                    modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+                    COPY1_IF_LT(bcost, modeCosts[mode]);
+                }
             }
 
             /* Find the top maxCandCount candidate modes with cost within 25% of best
diff -r c4df42f39c2e -r ad1beedf5dd6 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Jan 07 18:01:59 2015 +0530
+++ b/source/encoder/slicetype.cpp	Thu Jan 08 09:28:57 2015 +0530
@@ -1713,40 +1713,76 @@
         cost = m_me.bufSATD(m_predictions, cuSize);
         if (cost < icost)
             icost = cost;
-        primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
 
-        // calculate satd costs, keep least cost
-        ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
-        primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
+        uint32_t mode, lowmode = 4;
+        if (primitives.intra_pred_allangs[sizeIdx])
+        {
+            ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
 
-        int acost = m_me.COST_MAX;
-        uint32_t mode, lowmode = 4;
-        for (mode = 5; mode < 35; mode += 5)
+            primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
+            primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
+
+            int acost = m_me.COST_MAX;
+            for (mode = 5; mode < 35; mode += 5)
+            {
+                if (mode < 18)
+                    cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+                else
+                    cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+            }
+            for (uint32_t dist = 2; dist >= 1; dist--)
+            {
+                mode = lowmode - dist;
+                if (mode < 18)
+                    cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+                else
+                    cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+
+                mode = lowmode + dist;
+                if (mode < 18)
+                    cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+                else
+                    cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+            }
+            if (acost < icost)
+                icost = acost;
+        }
+        else
         {
-            if (mode < 18)
-                cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
-            else
-                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
+            int acost = m_me.COST_MAX;
+            for (mode = 5; mode < 35; mode += 5)
+            {
+                if (g_intraFilterFlags[mode] & cuSize)
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                else
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                cost = m_me.bufSATD(m_predictions, cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+            }
+            for (uint32_t dist = 2; dist >= 1; dist--)
+            {
+                mode = lowmode - dist;
+                if (g_intraFilterFlags[mode] & cuSize)
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                else
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                cost = m_me.bufSATD(m_predictions, cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+
+                mode = lowmode + dist;
+                if (g_intraFilterFlags[mode] & cuSize)
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                else
+                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                cost = m_me.bufSATD(m_predictions, cuSize);
+                COPY2_IF_LT(acost, cost, lowmode, mode);
+            }
+            if (acost < icost)
+                icost = acost;
         }
-        for (uint32_t dist = 2; dist >= 1; dist--)
-        {
-            mode = lowmode - dist;
-            if (mode < 18)
-                cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
-            else
-                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-
-            mode = lowmode + dist;
-            if (mode < 18)
-                cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
-            else
-                cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
-            COPY2_IF_LT(acost, cost, lowmode, mode);
-        }
-        if (acost < icost)
-            icost = acost;
 
         const int intraPenalty = 5 * m_lookAheadLambda;
         icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */


More information about the x265-devel mailing list