[x265] [PATCH 1 of 5] intra: pull the simple 1:2:1 pixel filtering into a performance primitive
Steve Borho
steve at borho.org
Fri Feb 27 21:05:17 CET 2015
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1425064240 21600
# Fri Feb 27 13:10:40 2015 -0600
# Node ID 651e1baa3ecb02e75a08983d5cb8f824371887b6
# Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
intra: pull the simple 1:2:1 pixel filtering into a performance primitive
Only C reference implementations exist at this point, but at least the primitive is
templated so the compiler can optimize and unroll the loops cleanly.
As a side effect, this quiets the GCC 4.8 warning about loop bounds and
aggressive loop optimizations in slicetype.cpp.
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/intrapred.cpp Fri Feb 27 13:10:40 2015 -0600
@@ -27,6 +27,29 @@
using namespace x265;
namespace {
+
+template<int tuSize>
+void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples: each output is (prev + 2 * cur + next + 2) >> 2; reads samples[] and writes filtered[] at indices 0 .. 4 * tuSize, so both need 4 * tuSize + 1 elements */
+{
+ const int tuSize2 = tuSize << 1; // length of the top run, and likewise of the left run
+ // layout used below: [0] = top-left corner, [1 .. tuSize2] = top run, [tuSize2 + 1 .. 2 * tuSize2] = left run
+ pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
+ // the final sample of each run (topLast, leftLast) is copied to the output unfiltered
+ // filtering top: samples 1 .. tuSize2 - 1; for i == 1 the "previous" sample is the corner
+ for (int i = 1; i < tuSize2; i++)
+ filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
+ filtered[tuSize2] = topLast;
+ // the corner's two neighbours are the first top sample and the first left sample
+ // filtering top-left
+ filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
+ // the first left sample is done outside the loop: its predecessor is the corner, not samples[tuSize2] (the last top sample)
+ // filtering left
+ filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
+ for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++) // remaining left samples, up to the one before leftLast
+ filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
+ filtered[tuSize2 + tuSize2] = leftLast;
+}
+
void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
{
// boundary pixels processing
@@ -216,6 +239,11 @@
void setupIntraPrimitives_c(EncoderPrimitives& p)
{
+ p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
+ p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
+ p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
+ p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;
+
p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>;
p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>;
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_c<4>;
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/predict.cpp
--- a/source/common/predict.cpp Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/predict.cpp Fri Feb 27 13:10:40 2015 -0600
@@ -93,34 +93,17 @@
void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
{
int tuSize = 1 << log2TrSizeC;
- int tuSize2 = tuSize << 1;
+ int sizeIdx = log2TrSizeC - 2;
+ X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
pixel* srcBuf = intraNeighbourBuf[0];
if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
{
- pixel* fltBuf = intraNeighbourBuf[1];
- pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
-
- // filtering top
- for (int i = 1; i < tuSize2; i++)
- fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
- fltBuf[tuSize2] = topLast;
-
- // filtering top-left
- fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
-
- // filtering left
- fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
- for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
- fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
- fltBuf[tuSize2 + tuSize2] = leftLast;
-
+ primitives.cu[sizeIdx].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
srcBuf = intraNeighbourBuf[1];
}
- int sizeIdx = log2TrSizeC - 2;
- X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
}
@@ -626,12 +609,12 @@
}
}
-void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
+void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
{
- int tuSize = intraNeighbors.tuSize;
+ int tuSize = 1 << intraNeighbors.log2TrSize;
int tuSize2 = tuSize << 1;
- pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+ pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
@@ -673,25 +656,13 @@
}
}
- // filtering top
- for (int i = 1; i < tuSize2; i++)
- fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
- fltBuf[tuSize2] = topLast;
-
- // filtering top-left
- fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2;
-
- // filtering left
- fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2;
- for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
- fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
- fltBuf[tuSize2 + tuSize2] = leftLast;
+ primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
}
}
-void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
+void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
{
- const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+ const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
@@ -750,7 +721,7 @@
intraNeighbors->leftUnits = leftUnits;
intraNeighbors->unitWidth = 1 << log2UnitWidth;
intraNeighbors->unitHeight = 1 << log2UnitHeight;
- intraNeighbors->tuSize = tuSize;
+ intraNeighbors->log2TrSize = log2TrSize;
}
void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258])
@@ -758,7 +729,7 @@
const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
int totalUnits = intraNeighbors.totalUnits;
- uint32_t tuSize = intraNeighbors.tuSize;
+ uint32_t tuSize = 1 << intraNeighbors.log2TrSize;
uint32_t refSize = tuSize * 2 + 1;
// Nothing is available, perform DC prediction.
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/predict.h
--- a/source/common/predict.h Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/predict.h Fri Feb 27 13:10:40 2015 -0600
@@ -67,7 +67,7 @@
int leftUnits;
int unitWidth;
int unitHeight;
- int tuSize;
+ int log2TrSize;
bool bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
};
@@ -102,8 +102,8 @@
/* Angular Intra */
void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
- void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
- void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
+ void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+ void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
/* Intra prediction helper functions */
static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/primitives.h
--- a/source/common/primitives.h Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/primitives.h Fri Feb 27 13:10:40 2015 -0600
@@ -119,6 +119,7 @@
typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -247,6 +248,7 @@
transpose_t transpose; // transpose pixel block; for use with intra all-angs
intra_allangs_t intra_pred_allangs;
+ intra_filter_t intra_filter;
intra_pred_t intra_pred[NUM_INTRA_MODE];
}
cu[NUM_CU_SIZES];
diff -r 018e8bbaa854 -r 651e1baa3ecb source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Fri Feb 27 11:46:09 2015 +0530
+++ b/source/encoder/slicetype.cpp Fri Feb 27 13:10:40 2015 -0600
@@ -212,6 +212,7 @@
ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
+ pixel* samples = neighbours[0], *filtered = neighbours[1];
const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
const int intraPenalty = 5 * lookAheadLambda;
@@ -221,8 +222,8 @@
const int cuSize2 = cuSize << 1;
const int sizeIdx = X265_LOWRES_CU_BITS - 2;
- pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
pixelcmp_t satd = primitives.pu[sizeIdx].satd;
+ int planar = !!(cuSize >= 8);
fenc.costEst[0][0] = 0;
fenc.costEstAq[0][0] = 0;
@@ -235,43 +236,33 @@
{
const int cuXY = cuX + cuY * widthInCU;
const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride;
+ pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
- /* Prep reference pixels */
- pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
+ /* copy fenc pixels */
primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, pixCur, fenc.lumaStride);
- memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
+ /* collect reference sample pixels */
+ memcpy(samples, pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
for (int i = 1; i < cuSize + 1; i++)
- neighbours[0][i + cuSize2] = pixCur[(i - 1) * fenc.lumaStride - 1];
+ samples[i + cuSize2] = pixCur[(i - 1) * fenc.lumaStride - 1];
for (int i = 0; i < cuSize; i++)
{
- neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; // Copy above-last pixel
- neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; // Copy left-last pixel
+ samples[i + cuSize + 1] = samples[cuSize]; // Copy above-last pixel
+ samples[i + cuSize2 + cuSize + 1] = samples[cuSize2 + cuSize]; // Copy left-last pixel
}
- neighbours[1][0] = neighbours[0][0]; // Copy top-left pixel
- neighbours[1][cuSize2] = neighbours[0][cuSize2]; // Copy top-right pixel
- neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
-
- // Filter neighbour pixels with [1-2-1]
- neighbours[1][1] = (neighbours[0][0] + (neighbours[0][1] << 1) + neighbours[0][2] + 2) >> 2;
- neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
- for (int i = 2; i < cuSize2; i++)
- {
- neighbours[1][i] = (neighbours[0][i - 1] + (neighbours[0][i] << 1) + neighbours[0][i + 1] + 2) >> 2;
- neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
- }
+ primitives.cu[sizeIdx].intra_filter(samples, filtered);
int cost, icost = me.COST_MAX;
uint32_t ilowmode = 0;
/* DC and planar */
- primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, cuSize <= 16);
+ primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, samples, 0, cuSize <= 16);
cost = satd(fencIntra, cuSize, prediction, cuSize);
COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
- primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
+ primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, neighbours[planar], 0, 0);
cost = satd(fencIntra, cuSize, prediction, cuSize);
COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
More information about the x265-devel
mailing list