[x265] [PATCH 1 of 2] intra: do not use C references for intra all-angs
Steve Borho
steve at borho.org
Thu Jan 8 05:59:24 CET 2015
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1420689537 -19800
# Thu Jan 08 09:28:57 2015 +0530
# Node ID ad1beedf5dd689e1d59d5d41e8e9b53d87d4c75b
# Parent c4df42f39c2e796ce19b8c3b73969af0230609a1
intra: do not use C references for intra all-angs
For platforms that do not have SIMD-optimized all-angs functions, it is much
better to call the single angular functions as needed. The C reference for this
primitive tries to match the SIMD outputs, and so performs many redundant
transposes.
diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/predict.cpp
--- a/source/common/predict.cpp Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/predict.cpp Thu Jan 08 09:28:57 2015 +0530
@@ -86,16 +86,16 @@
pixel* refLft;
pixel* refAbv;
- if (!(g_intraFilterFlags[dirMode] & tuSize))
+ if (g_intraFilterFlags[dirMode] & tuSize)
+ {
+ refLft = m_refLeftFlt + tuSize - 1;
+ refAbv = m_refAboveFlt + tuSize - 1;
+ }
+ else
{
refLft = m_refLeft + tuSize - 1;
refAbv = m_refAbove + tuSize - 1;
}
- else
- {
- refLft = m_refLeftFlt + tuSize - 1;
- refAbv = m_refAboveFlt + tuSize - 1;
- }
bool bFilter = log2TrSize <= 4;
int sizeIdx = log2TrSize - 2;
diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/primitives.cpp
--- a/source/common/primitives.cpp Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/primitives.cpp Thu Jan 08 09:28:57 2015 +0530
@@ -162,6 +162,12 @@
{
Setup_C_Primitives(primitives);
+ /* We do not want the encoder to use the un-optimized intra all-angles
+ * C references. It is better to call the individual angle functions
+ * instead. We must check for NULL before using this primitive */
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ primitives.intra_pred_allangs[i] = NULL;
+
#if ENABLE_ASSEMBLY
Setup_Instrinsic_Primitives(primitives, cpuid);
Setup_Assembly_Primitives(primitives, cpuid);
diff -r c4df42f39c2e -r ad1beedf5dd6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jan 07 18:01:59 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Jan 08 09:28:57 2015 +0530
@@ -1246,32 +1246,6 @@
namespace x265 {
// private x265 namespace
-#if HIGH_BIT_DEPTH
-/* Very similar to CRef in intrapred.cpp, except it uses optimized primitives */
-template<int log2Size>
-void intra_allangs(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
-{
- const int size = 1 << log2Size;
- const int sizeIdx = log2Size - 2;
- ALIGN_VAR_32(pixel, buffer[32 * 32]);
-
- for (int mode = 2; mode <= 34; mode++)
- {
- pixel *left = (g_intraFilterFlags[mode] & size ? left1 : left0);
- pixel *above = (g_intraFilterFlags[mode] & size ? above1 : above0);
- pixel *out = dest + ((mode - 2) << (log2Size * 2));
-
- if (mode < 18)
- {
- primitives.intra_pred[mode][sizeIdx](buffer, size, left, above, mode, bLuma);
- primitives.transpose[sizeIdx](out, buffer, size);
- }
- else
- primitives.intra_pred[mode][sizeIdx](out, size, left, above, mode, bLuma);
- }
-}
-#endif
-
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
{
#if HIGH_BIT_DEPTH
@@ -1521,14 +1495,6 @@
p.chroma[X265_CSP_I422].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I422].copy_ss[i];
}
- if (p.intra_pred[0][0] && p.transpose[0])
- {
- p.intra_pred_allangs[BLOCK_4x4] = intra_allangs<2>;
- p.intra_pred_allangs[BLOCK_8x8] = intra_allangs<3>;
- p.intra_pred_allangs[BLOCK_16x16] = intra_allangs<4>;
- p.intra_pred_allangs[BLOCK_32x32] = intra_allangs<5>;
- }
-
#else // if HIGH_BIT_DEPTH
if (cpuMask & X265_CPU_SSE2)
{
diff -r c4df42f39c2e -r ad1beedf5dd6 source/encoder/search.cpp
--- a/source/encoder/search.cpp Wed Jan 07 18:01:59 2015 +0530
+++ b/source/encoder/search.cpp Thu Jan 08 09:28:57 2015 +0530
@@ -1290,22 +1290,36 @@
cost = m_rdCost.calcRdSADCost(sad, bits);
COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits);
- // Transpose NxN
- primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
-
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+ bool allangs = true;
+ if (primitives.intra_pred_allangs[sizeIdx])
+ {
+ primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
+ primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+ }
+ else
+ allangs = false;
bool modeHor;
const pixel* cmp;
intptr_t srcStride;
#define TRY_ANGLE(angle) \
- modeHor = angle < 18; \
- cmp = modeHor ? bufTrans : fenc; \
- srcStride = modeHor ? scaleTuSize : scaleStride; \
- sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
- bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
- cost = m_rdCost.calcRdSADCost(sad, bits)
+ if (allangs) { /* tmp holds every angle; modes < 18 are compared against the transposed source */ \
+ modeHor = angle < 18; \
+ cmp = modeHor ? bufTrans : fenc; \
+ srcStride = modeHor ? scaleTuSize : scaleStride; \
+ sad = sa8d(cmp, srcStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ } else { /* no all-angs primitive: predict only this angle into tmp, un-transposed */ \
+ if (g_intraFilterFlags[angle] & scaleTuSize) \
+ primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, angle, scaleTuSize <= 16); \
+ else \
+ primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, left, above, angle, scaleTuSize <= 16); \
+ sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; /* cmp/srcStride are unset on this path */ \
+ bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
+ cost = m_rdCost.calcRdSADCost(sad, bits); \
+ }
if (m_param->bEnableFastIntra)
{
@@ -1520,18 +1534,34 @@
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
// angular predictions
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
-
- primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
- for (int mode = 2; mode < 35; mode++)
+ if (primitives.intra_pred_allangs[sizeIdx])
{
- bool modeHor = (mode < 18);
- const pixel* cmp = (modeHor ? buf_trans : fenc);
- intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
- sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- COPY1_IF_LT(bcost, modeCosts[mode]);
+ primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+ primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
+ for (int mode = 2; mode < 35; mode++)
+ {
+ bool modeHor = (mode < 18);
+ const pixel* cmp = (modeHor ? buf_trans : fenc);
+ intptr_t srcStride = (modeHor ? scaleTuSize : scaleStride);
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ sad = sa8d(cmp, srcStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
+ }
+ else
+ {
+ for (int mode = 2; mode < 35; mode++)
+ {
+ if (g_intraFilterFlags[mode] & scaleTuSize)
+ primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, mode, scaleTuSize <= 16);
+ else
+ primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, left, above, mode, scaleTuSize <= 16);
+ bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
+ sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift;
+ modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
+ COPY1_IF_LT(bcost, modeCosts[mode]);
+ }
}
/* Find the top maxCandCount candidate modes with cost within 25% of best
diff -r c4df42f39c2e -r ad1beedf5dd6 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Wed Jan 07 18:01:59 2015 +0530
+++ b/source/encoder/slicetype.cpp Thu Jan 08 09:28:57 2015 +0530
@@ -1713,40 +1713,76 @@
cost = m_me.bufSATD(m_predictions, cuSize);
if (cost < icost)
icost = cost;
- primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
- // calculate satd costs, keep least cost
- ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
+ uint32_t mode, lowmode = 4;
+ if (primitives.intra_pred_allangs[sizeIdx])
+ {
+ ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
- int acost = m_me.COST_MAX;
- uint32_t mode, lowmode = 4;
- for (mode = 5; mode < 35; mode += 5)
+ primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
+ primitives.transpose[sizeIdx](buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
+
+ int acost = m_me.COST_MAX;
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ if (mode < 18)
+ cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+ else
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+ }
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ mode = lowmode - dist;
+ if (mode < 18)
+ cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+ else
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+
+ mode = lowmode + dist;
+ if (mode < 18)
+ cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
+ else
+ cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+ }
+ if (acost < icost)
+ icost = acost;
+ }
+ else
{
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
- COPY2_IF_LT(acost, cost, lowmode, mode);
+ int acost = m_me.COST_MAX;
+ for (mode = 5; mode < 35; mode += 5)
+ {
+ if (g_intraFilterFlags[mode] & cuSize)
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+ else
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+ cost = m_me.bufSATD(m_predictions, cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+ }
+ for (uint32_t dist = 2; dist >= 1; dist--)
+ {
+ mode = lowmode - dist;
+ if (g_intraFilterFlags[mode] & cuSize)
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+ else
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+ cost = m_me.bufSATD(m_predictions, cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+
+ mode = lowmode + dist;
+ if (g_intraFilterFlags[mode] & cuSize)
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+ else
+ primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+ cost = m_me.bufSATD(m_predictions, cuSize);
+ COPY2_IF_LT(acost, cost, lowmode, mode);
+ }
+ if (acost < icost)
+ icost = acost;
}
- for (uint32_t dist = 2; dist >= 1; dist--)
- {
- mode = lowmode - dist;
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
- COPY2_IF_LT(acost, cost, lowmode, mode);
-
- mode = lowmode + dist;
- if (mode < 18)
- cost = satd(buf_trans, cuSize, &m_predictions[mode * predsize], cuSize);
- else
- cost = m_me.bufSATD(&m_predictions[mode * predsize], cuSize);
- COPY2_IF_LT(acost, cost, lowmode, mode);
- }
- if (acost < icost)
- icost = acost;
const int intraPenalty = 5 * m_lookAheadLambda;
icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */
More information about the x265-devel
mailing list