[x265] [PATCH] intra refactoring: intra prediction with modified planar, angular prediction functions
ashok at multicorewareinc.com
ashok at multicorewareinc.com
Fri Nov 28 16:35:37 CET 2014
# HG changeset patch
# User Ashok Kumar Mishra<ashok at multicorewareinc.com>
# Date 1417187632 -19800
# Fri Nov 28 20:43:52 2014 +0530
# Node ID 0b87552d51f96e5c7376bf3b8b57951b83a8f4f2
# Parent b6146d08a1f1c78246c2650e532db8cefdbc3bfb
intra refactoring: intra prediction with modified planar, angular prediction functions
The buffer used for intra prediction is reduced from roughly 1152 * 1056 + 768 bytes to around 2 * (1 + 2 * 128) bytes.
The planar and angular prediction functions are modified. This patch uses the modified C functions for all intra prediction functions.
This may affect overall performance, so it is suggested to map all of the modified C functions to their corresponding asm functions.
diff -r b6146d08a1f1 -r 0b87552d51f9 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Tue Nov 25 13:04:58 2014 +0530
+++ b/source/common/intrapred.cpp Fri Nov 28 20:43:52 2014 +0530
@@ -86,6 +86,25 @@
}
}
+template<int width> // DC intra prediction: fills a width x width block in dst with the rounded mean of 2 * width neighbour samples from srcPix.
+void intra_pred_dc_c_new(pixel* dst, intptr_t dstStride, pixel* srcPix, int /*dirMode*/, int bFilter)
+{
+ int k, l;
+
+ int dcVal = width; // start at width so the division by 2 * width below rounds to nearest
+ for (int i = 0; i < width; i++)
+ dcVal += srcPix[1 + i] + srcPix[2 * width + 1 + i]; // sums srcPix[1..width] and srcPix[2*width+1..3*width]; presumably the above row and the left column (same offsets as planar_pred_c_new)
+
+ dcVal = dcVal / (width + width);
+ for (k = 0; k < width; k++)
+ for (l = 0; l < width; l++)
+ dst[k * dstStride + l] = (pixel)dcVal; // flat fill; dstStride is the row pitch of dst in pixels
+
+ if (bFilter)
+ dcPredFilter(srcPix + 1, srcPix + (2 * width + 1), dst, dstStride, width); // optional post-filter, delegated to dcPredFilter (defined outside this hunk)
+
+}
+
template<int log2Size>
void planar_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/)
{
@@ -133,6 +152,21 @@
}
}
+template<int log2Size> // Planar intra prediction for a (1 << log2Size) square block, reading all neighbours from the single srcPix array.
+void planar_pred_c_new(pixel* dst, intptr_t dstStride, pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
+{
+ const int blkSize = 1 << log2Size;
+
+ pixel* above = srcPix + 1; // above row starts right after srcPix[0]
+ pixel* left = srcPix + (2 * blkSize + 1); // left column follows the 2 * blkSize above samples
+
+ pixel topRight = above[blkSize]; // first sample past the block width in the above row
+ pixel bottomLeft = left[blkSize]; // first sample past the block height in the left column
+ for (int y = 0; y < blkSize; y++)
+ for (int x = 0; x < blkSize; x++)
+ dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1)); // sum of horizontal and vertical linear blends; + blkSize rounds the final shift
+}
+
template<int width>
void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
{
@@ -140,7 +174,7 @@
int k, l;
bool modeHor = (dirMode < 18);
bool modeVer = !modeHor;
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0;
+ int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? - ((int)dirMode - HOR_IDX) : 0;
int absAng = abs(intraPredAngle);
int signAng = intraPredAngle < 0 ? -1 : 1;
@@ -244,6 +278,107 @@
}
}
+template<int width> // Angular intra prediction of a width x width block from the single neighbour array srcPix; modes below 18 are predicted on swapped neighbours and transposed at the end.
+void intra_pred_ang_c_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+{
+ int width2 = width << 1; // number of samples in each neighbour run (above, left)
+ // Flip the neighbours in the horizontal case.
+ int horMode = dirMode < 18;
+ pixel neighbourBuf[129]; // 1 corner + 2 * 64 samples: enough for width 32, the largest size registered in this patch
+
+ if (horMode)
+ {
+ neighbourBuf[0] = srcPix[0]; // corner sample is shared by both runs
+ for (int i = 0; i < width << 1; i++)
+ {
+ neighbourBuf[1 + i] = srcPix[width2 + 1 + i]; // second run of srcPix becomes the first run
+ neighbourBuf[width2 + 1 + i] = srcPix[1 + i]; // first run of srcPix becomes the second run
+ }
+ srcPix = neighbourBuf; // from here on the code only deals with the vertical-style layout
+ }
+
+ // Intra prediction angle and inverse angle tables.
+ const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+ const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
+
+ // Get the prediction angle.
+ int angleOffset = horMode ? 10 - dirMode : dirMode - 26; // signed distance from mode 10 (modes < 18) or mode 26 (modes >= 18)
+ int angle = angleTable[8 + angleOffset]; // index 8 holds angle 0
+
+ // Vertical Prediction.
+ if (!angle)
+ {
+ for (int y = 0; y < width; y++)
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = srcPix[1 + x]; // every row is a copy of the first width samples after the corner
+
+ if (bFilter)
+ {
+ int topLeft = srcPix[0], top = srcPix[1];
+ for (int y = 0; y < width; y++)
+ dst[y * dstStride] = (pixel)Clip3((int16_t)0, (int16_t)((1 << X265_DEPTH) - 1), static_cast<int16_t>(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1))); // first column only: add half the second-run gradient, clipped to the pixel range
+ }
+ }
+ else // Angular prediction.
+ {
+ // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
+ pixel refBuf[64], *ref; // scratch for the negative-angle case: projected samples, then the corner and width samples
+
+ // Use the projected left neighbours and the top neighbours.
+ if (angle < 0)
+ {
+ // Number of neighbours projected.
+ int nbProjected = -((width * angle) >> 5) - 1;
+ ref = refBuf + nbProjected + 1; // leaves room for ref[-1] (corner) and the projected samples below it
+
+ // Project the neighbours.
+ int invAngle = invAngleTable[- angleOffset - 1]; // angleOffset is -8..-1 here, giving index 0..7
+ int invAngleSum = 128; // rounding term for the >> 8 below
+ for (int i = 0; i < nbProjected; i++)
+ {
+ invAngleSum += invAngle;
+ ref[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)]; // pick a sample from the second neighbour run
+ }
+
+ // Copy the top-left and top pixels.
+ for (int i = 0; i < width + 1; i++)
+ ref[-1 + i] = srcPix[i];
+ }
+ else // Use the top and top-right neighbours.
+ ref = srcPix + 1;
+
+ // Pass every row.
+ int angleSum = 0;
+ for (int y = 0; y < width; y++)
+ {
+ angleSum += angle;
+ int offset = angleSum >> 5; // whole-sample displacement for this row
+ int fraction = angleSum & 31; // remaining displacement in 1/32 sample units
+
+ if (fraction) // Interpolate
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
+ else // Copy.
+ for (int x = 0; x < width; x++)
+ dst[y * dstStride + x] = ref[offset + x];
+ }
+ }
+
+ // Flip for horizontal.
+ if (horMode)
+ {
+ for (int y = 0; y < width - 1; y++) // in-place transpose: swap each element above the diagonal with its mirror
+ {
+ for (int x = y + 1; x < width; x++)
+ {
+ pixel tmp = dst[y * dstStride + x];
+ dst[y * dstStride + x] = dst[x * dstStride + y];
+ dst[x * dstStride + y] = tmp;
+ }
+ }
+ }
+}
+
template<int log2Size>
void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
{
@@ -274,6 +409,36 @@
}
}
}
+
+template<int log2Size> // Runs angular prediction for modes 2..34 and stores the 33 size x size results back to back in dest.
+void all_angs_pred_c_new(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+{
+ const int size = 1 << log2Size;
+ for (int mode = 2; mode <= 34; mode++)
+ {
+ pixel *srcPix = (g_intraFilterFlags[mode] & size ? filtPix : refPix); // filtered neighbours when the flag table has this block size set for the mode
+ pixel *out = dest + ((mode - 2) << (log2Size * 2)); // each mode owns size * size contiguous pixels
+
+ intra_pred_ang_c_new<size>(out, size, srcPix, mode, bLuma); // bLuma is forwarded as the bFilter argument
+
+ // Optimize code don't flip buffer
+ bool modeHor = (mode < 18);
+
+ // transpose the block if this is a horizontal mode
+ if (modeHor) // intra_pred_ang_c_new already transposed these modes, so this second transpose undoes that flip
+ {
+ for (int k = 0; k < size - 1; k++)
+ {
+ for (int l = k + 1; l < size; l++)
+ {
+ pixel tmp = out[k * size + l];
+ out[k * size + l] = out[l * size + k];
+ out[l * size + k] = tmp;
+ }
+ }
+ }
+ }
+}
}
namespace x265 {
@@ -286,22 +451,43 @@
p.intra_pred[0][BLOCK_16x16] = planar_pred_c<4>;
p.intra_pred[0][BLOCK_32x32] = planar_pred_c<5>;
+ p.intra_pred_new[0][BLOCK_4x4] = planar_pred_c_new<2>;
+ p.intra_pred_new[0][BLOCK_8x8] = planar_pred_c_new<3>;
+ p.intra_pred_new[0][BLOCK_16x16] = planar_pred_c_new<4>;
+ p.intra_pred_new[0][BLOCK_32x32] = planar_pred_c_new<5>;
+
// Intra Prediction DC
p.intra_pred[1][BLOCK_4x4] = intra_pred_dc_c<4>;
p.intra_pred[1][BLOCK_8x8] = intra_pred_dc_c<8>;
p.intra_pred[1][BLOCK_16x16] = intra_pred_dc_c<16>;
p.intra_pred[1][BLOCK_32x32] = intra_pred_dc_c<32>;
+
+ p.intra_pred_new[1][BLOCK_4x4] = intra_pred_dc_c_new<4>;
+ p.intra_pred_new[1][BLOCK_8x8] = intra_pred_dc_c_new<8>;
+ p.intra_pred_new[1][BLOCK_16x16] = intra_pred_dc_c_new<16>;
+ p.intra_pred_new[1][BLOCK_32x32] = intra_pred_dc_c_new<32>;
+
for (int i = 2; i < NUM_INTRA_MODE; i++)
{
p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>;
p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>;
p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>;
p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>;
+
+ p.intra_pred_new[i][BLOCK_4x4] = intra_pred_ang_c_new<4>;
+ p.intra_pred_new[i][BLOCK_8x8] = intra_pred_ang_c_new<8>;
+ p.intra_pred_new[i][BLOCK_16x16] = intra_pred_ang_c_new<16>;
+ p.intra_pred_new[i][BLOCK_32x32] = intra_pred_ang_c_new<32>;
}
p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<2>;
p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<3>;
p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<4>;
p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<5>;
+
+ p.intra_pred_allangs_new[BLOCK_4x4] = all_angs_pred_c_new<2>;
+ p.intra_pred_allangs_new[BLOCK_8x8] = all_angs_pred_c_new<3>;
+ p.intra_pred_allangs_new[BLOCK_16x16] = all_angs_pred_c_new<4>;
+ p.intra_pred_allangs_new[BLOCK_32x32] = all_angs_pred_c_new<5>;
}
}
diff -r b6146d08a1f1 -r 0b87552d51f9 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Nov 25 13:04:58 2014 +0530
+++ b/source/common/pixel.cpp Fri Nov 28 20:43:52 2014 +0530
@@ -660,6 +660,31 @@
}
}
+void scale1D_128to64_new(pixel *dst, const pixel *src, intptr_t /*stride*/) // Halves two consecutive 128-sample runs of src into two consecutive 64-sample runs of dst by averaging adjacent pairs.
+{
+ int x;
+ const pixel* src1 = src; // first run: src[0..127]
+ const pixel* src2 = src + 128; // second run: src[128..255]
+
+ pixel* dst1 = dst; // first output run: dst[0..63]
+ pixel* dst2 = dst + 64/*128*/; // second output run: dst[64..127]
+
+ for (x = 0; x < 128; x += 2)
+ {
+ // Top pixel
+ pixel pix0 = src1[(x + 0)];
+ pixel pix1 = src1[(x + 1)];
+
+ // Left pixel
+ pixel pix2 = src2[(x + 0)];
+ pixel pix3 = src2[(x + 1)];
+ int sum1 = pix0 + pix1;
+ int sum2 = pix2 + pix3;
+
+ dst1[x >> 1] = (pixel)((sum1 + 1) >> 1); // rounded average of the pair
+ dst2[x >> 1] = (pixel)((sum2 + 1) >> 1); // rounded average of the pair
+ }
+}
void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
{
uint32_t x, y;
@@ -1354,6 +1379,7 @@
p.weight_sp = weight_sp_c;
p.scale1D_128to64 = scale1D_128to64;
+ p.scale1D_128to64_new = scale1D_128to64_new;
p.scale2D_64to32 = scale2D_64to32;
p.frame_init_lowres_core = frame_init_lowres_core;
p.ssim_4x4x2_core = ssim_4x4x2_core;
diff -r b6146d08a1f1 -r 0b87552d51f9 source/common/predict.cpp
--- a/source/common/predict.cpp Tue Nov 25 13:04:58 2014 +0530
+++ b/source/common/predict.cpp Fri Nov 28 20:43:52 2014 +0530
@@ -40,18 +40,11 @@
Predict::Predict()
{
- m_predBuf = NULL;
- m_refAbove = NULL;
- m_refAboveFlt = NULL;
- m_refLeft = NULL;
- m_refLeftFlt = NULL;
m_immedVals = NULL;
}
Predict::~Predict()
{
- X265_FREE(m_predBuf);
- X265_FREE(m_refAbove);
X265_FREE(m_immedVals);
m_predShortYuv[0].destroy();
m_predShortYuv[1].destroy();
@@ -63,15 +56,7 @@
m_hChromaShift = CHROMA_H_SHIFT(csp);
m_vChromaShift = CHROMA_V_SHIFT(csp);
- int predBufHeight = ((MAX_CU_SIZE + 2) << 4);
- int predBufStride = ((MAX_CU_SIZE + 8) << 4);
- CHECKED_MALLOC(m_predBuf, pixel, predBufStride * predBufHeight);
CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
- CHECKED_MALLOC(m_refAbove, pixel, 12 * MAX_CU_SIZE);
-
- m_refAboveFlt = m_refAbove + 3 * MAX_CU_SIZE;
- m_refLeft = m_refAboveFlt + 3 * MAX_CU_SIZE;
- m_refLeftFlt = m_refLeft + 3 * MAX_CU_SIZE;
return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
@@ -82,68 +67,53 @@
void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize)
{
int tuSize = 1 << log2TrSize;
-
- pixel *refLft, *refAbv;
+ pixel* srcPix;
if (!(g_intraFilterFlags[dirMode] & tuSize))
- {
- refLft = m_refLeft + tuSize - 1;
- refAbv = m_refAbove + tuSize - 1;
- }
+ srcPix = intraNeighbours[0];
else
- {
- refLft = m_refLeftFlt + tuSize - 1;
- refAbv = m_refAboveFlt + tuSize - 1;
- }
+ srcPix = intraNeighbours[1];
bool bFilter = log2TrSize <= 4;
int sizeIdx = log2TrSize - 2;
X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
- primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter);
+// primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter);
+ primitives.intra_pred_new[dirMode][sizeIdx](dst, stride, srcPix, dirMode, bFilter);
}
-void Predict::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
+void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
{
int tuSize = 1 << log2TrSizeC;
int tuSize2 = tuSize << 1;
- // Create the prediction
- const int bufOffset = tuSize - 1;
- pixel buf0[3 * MAX_CU_SIZE];
- pixel buf1[3 * MAX_CU_SIZE];
- pixel* above;
- pixel* left = buf0 + bufOffset;
-
- int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
-
- left[0] = src[0];
- for (int k = 1; k < limit; k++)
- left[k] = src[k + tuSize2];
+ pixel* srcBuf = intraNeighbours[0];
if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
{
- // generate filtered intra prediction samples
- buf0[bufOffset - 1] = src[1];
- left = buf1 + bufOffset;
- for (int i = 0; i < tuSize2; i++)
- left[i] = (buf0[bufOffset + i - 1] + 2 * buf0[bufOffset + i] + buf0[bufOffset + i + 1] + 2) >> 2;
- left[tuSize2] = buf0[bufOffset + tuSize2];
+ pixel* fltBuf = intraNeighbours[1];
+ pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
- above = buf0 + bufOffset;
- above[0] = left[0];
+ // filtering top
for (int i = 1; i < tuSize2; i++)
- above[i] = (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2;
- above[tuSize2] = src[tuSize2];
- }
- else
- {
- above = buf1 + bufOffset;
- memcpy(above, src, (tuSize2 + 1) * sizeof(pixel));
+ fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2] = topLast;
+
+ // filtering top-left
+ fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
+
+ //filtering left
+ fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
+ for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+ fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2 + tuSize2] = leftLast;
+
+ srcBuf = intraNeighbours[1];
}
int sizeIdx = log2TrSizeC - 2;
X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
- primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
+
+ primitives.intra_pred_new[dirMode][sizeIdx](dst, stride, srcBuf, dirMode, 0);
}
void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
@@ -660,37 +630,22 @@
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, partDepth, true, &intraNeighbors);
- pixel* adiBuf = m_predBuf;
- pixel* refAbove = m_refAbove;
- pixel* refLeft = m_refLeft;
- pixel* refAboveFlt = m_refAboveFlt;
- pixel* refLeftFlt = m_refLeftFlt;
-
int tuSize = intraNeighbors.tuSize;
int tuSize2 = tuSize << 1;
pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
- fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
+ fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbours[0]);
- // initialization of ADI buffers
- const int bufOffset = tuSize - 1;
- refAbove += bufOffset;
- refLeft += bufOffset;
+ pixel* refBuf = intraNeighbours[0];
+ pixel* fltBuf = intraNeighbours[1];
- memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
-
- refLeft[0] = adiBuf[0];
- for (int k = 1; k < tuSize2 + 1 ; k++)
- refLeft[k] = adiBuf[k + tuSize2];
+ pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
{
// generate filtered intra prediction samples
- refAboveFlt += bufOffset;
- refLeftFlt += bufOffset;
-
bool bStrongSmoothing = (tuSize == 32 && cu.m_slice->m_sps->bUseStrongIntraSmoothing);
if (bStrongSmoothing)
@@ -698,44 +653,49 @@
const int trSize = 32;
const int trSize2 = trSize << 1;
const int threshold = 1 << (X265_DEPTH - 5);
- int refBL = refLeft[trSize2];
- int refTL = refAbove[0];
- int refTR = refAbove[trSize2];
- bStrongSmoothing = (abs(refBL + refTL - (refLeft[trSize] << 1)) < threshold &&
- abs(refTL + refTR - (refAbove[trSize] << 1)) < threshold);
+
+ pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
+
+ bStrongSmoothing = abs (topLeft + topLast - (topMiddle << 1)) < threshold &&
+ abs (topLeft + leftLast - (leftMiddle << 1)) < threshold;
if (bStrongSmoothing)
{
// bilinear interpolation
- const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
- int init = (refTL << shift) + tuSize;
+ const int shift = 5 + 1;
+ int init = (topLeft << shift) + tuSize;
int deltaL, deltaR;
- refLeftFlt[0] = refAboveFlt[0] = refAbove[0];
+ //TODO: Performance Primitive???
+ deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
- //TODO: Performance Primitive???
- deltaL = refBL - refTL; deltaR = refTR - refTL;
+ fltBuf[0] = topLeft;
for (int i = 1; i < trSize2; i++)
{
- refLeftFlt[i] = (pixel)((init + deltaL * i) >> shift);
- refAboveFlt[i] = (pixel)((init + deltaR * i) >> shift);
+ fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); //Left Filtering
+ fltBuf[i] = (pixel)((init + deltaR * i) >> shift); //Above Filtering
}
- refLeftFlt[trSize2] = refLeft[trSize2];
- refAboveFlt[trSize2] = refAbove[trSize2];
+ fltBuf[trSize2] = topLast;
+ fltBuf[tuSize2 + trSize2] = leftLast;
return;
}
}
- refLeftFlt[0] = (refAbove[1] + (refLeft[0] << 1) + refLeft[1] + 2) >> 2;
+ // filtering top
for (int i = 1; i < tuSize2; i++)
- refLeftFlt[i] = (refLeft[i - 1] + (refLeft[i] << 1) + refLeft[i + 1] + 2) >> 2;
- refLeftFlt[tuSize2] = refLeft[tuSize2];
+ fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2] = topLast;
- refAboveFlt[0] = refLeftFlt[0];
- for (int i = 1; i < tuSize2; i++)
- refAboveFlt[i] = (refAbove[i - 1] + (refAbove[i] << 1) + refAbove[i + 1] + 2) >> 2;
- refAboveFlt[tuSize2] = refAbove[tuSize2];
+
+ // filtering top-left
+ fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2;
+
+ //filtering left
+ fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2;
+ for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+ fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+ fltBuf[tuSize2 + tuSize2] = leftLast;
}
}
@@ -743,13 +703,11 @@
{
IntraNeighbors intraNeighbors;
initIntraNeighbors(cu, absPartIdx, partDepth, false, &intraNeighbors);
- uint32_t tuSize = intraNeighbors.tuSize;
const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
- pixel* adiRef = getAdiChromaBuf(chromaId, tuSize);
- fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
+ fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbours[0]);
}
void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
@@ -797,158 +755,157 @@
intraNeighbors->log2TrSize = log2TrSize;
}
-void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
-{
- const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
- int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
- int totalUnits = intraNeighbors.totalUnits;
- uint32_t tuSize = intraNeighbors.tuSize;
- uint32_t refSize = tuSize * 2 + 1;
-
- // Nothing is available, perform DC prediction.
- if (numIntraNeighbor == 0)
- {
- // Fill border with DC value
- for (uint32_t i = 0; i < refSize; i++)
- adiRef[i] = dcValue;
-
- for (uint32_t i = 0; i < refSize - 1; i++)
- adiRef[i + refSize] = dcValue;
- }
- else if (numIntraNeighbor == totalUnits)
- {
- // Fill top border with rec. samples
- const pixel* adiTemp = adiOrigin - picStride - 1;
- memcpy(adiRef, adiTemp, refSize * sizeof(*adiRef));
-
- // Fill left border with rec. samples
- adiTemp = adiOrigin - 1;
-
- for (uint32_t i = 0; i < refSize - 1; i++)
- {
- adiRef[i + refSize] = adiTemp[0];
- adiTemp += picStride;
- }
- }
- else // reference samples are partially available
- {
- const bool* bNeighborFlags = intraNeighbors.bNeighborFlags;
- const bool* pNeighborFlags;
- int aboveUnits = intraNeighbors.aboveUnits;
- int leftUnits = intraNeighbors.leftUnits;
- int unitWidth = intraNeighbors.unitWidth;
- int unitHeight = intraNeighbors.unitHeight;
- int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
- pixel adiLineBuffer[5 * MAX_CU_SIZE];
- pixel* adi;
-
- // Initialize
- for (int i = 0; i < totalSamples; i++)
- adiLineBuffer[i] = dcValue;
-
- // Fill top-left sample
- const pixel* adiTemp = adiOrigin - picStride - 1;
- adi = adiLineBuffer + (leftUnits * unitHeight);
- pNeighborFlags = bNeighborFlags + leftUnits;
- if (*pNeighborFlags)
- {
- pixel topLeftVal = adiTemp[0];
- for (int i = 0; i < unitWidth; i++)
- adi[i] = topLeftVal;
- }
-
- // Fill left & below-left samples
- adiTemp += picStride;
- adi--;
- pNeighborFlags--;
- for (int j = 0; j < leftUnits; j++)
- {
- if (*pNeighborFlags)
- for (int i = 0; i < unitHeight; i++)
- adi[-i] = adiTemp[i * picStride];
-
- adiTemp += unitHeight * picStride;
- adi -= unitHeight;
- pNeighborFlags--;
- }
-
- // Fill above & above-right samples
- adiTemp = adiOrigin - picStride;
- adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth;
- pNeighborFlags = bNeighborFlags + leftUnits + 1;
- for (int j = 0; j < aboveUnits; j++)
- {
- if (*pNeighborFlags)
- memcpy(adi, adiTemp, unitWidth * sizeof(*adiTemp));
- adiTemp += unitWidth;
- adi += unitWidth;
- pNeighborFlags++;
- }
-
- // Pad reference samples when necessary
- int curr = 0;
- int next = 1;
- adi = adiLineBuffer;
- int pAdiLineTopRowOffset = leftUnits * (unitHeight - unitWidth);
- if (!bNeighborFlags[0])
- {
- // very bottom unit of bottom-left; at least one unit will be valid.
- while (next < totalUnits && !bNeighborFlags[next])
- next++;
-
- pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
- const pixel refSample = *pAdiLineNext;
- // Pad unavailable samples with new value
- int nextOrTop = X265_MIN(next, leftUnits);
- // fill left column
- while (curr < nextOrTop)
- {
- for (int i = 0; i < unitHeight; i++)
- adi[i] = refSample;
-
- adi += unitHeight;
- curr++;
- }
-
- // fill top row
- while (curr < next)
- {
- for (int i = 0; i < unitWidth; i++)
- adi[i] = refSample;
-
- adi += unitWidth;
- curr++;
- }
- }
-
- // pad all other reference samples.
- while (curr < totalUnits)
- {
- if (!bNeighborFlags[curr]) // samples not available
- {
- int numSamplesInCurrUnit = (curr >= leftUnits) ? unitWidth : unitHeight;
- const pixel refSample = *(adi - 1);
- for (int i = 0; i < numSamplesInCurrUnit; i++)
- adi[i] = refSample;
-
- adi += numSamplesInCurrUnit;
- curr++;
- }
- else
- {
- adi += (curr >= leftUnits) ? unitWidth : unitHeight;
- curr++;
- }
- }
-
- // Copy processed samples
- adi = adiLineBuffer + refSize + unitWidth - 2;
- memcpy(adiRef, adi, refSize * sizeof(*adiRef));
-
- adi = adiLineBuffer + refSize - 1;
- for (int i = 0; i < (int)refSize - 1; i++)
- adiRef[i + refSize] = adi[-(i + 1)];
- }
+void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258]) // Fills dst with refSize samples from the row above (top-left first) followed by refSize - 1 samples from the left column, substituting values where neighbours are unavailable.
+{ // NOTE(review): the [258] bound is not enforced (array parameters decay to pointers); callers in this patch pass intraNeighbours[0], which is declared with 257 entries - confirm the intended size.
+ const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1)); // half of the pixel range for the configured bit depth
+ int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
+ int totalUnits = intraNeighbors.totalUnits;
+ uint32_t tuSize = intraNeighbors.tuSize;
+ uint32_t refSize = tuSize * 2 + 1; // top-left sample plus 2 * tuSize samples
+
+ // Nothing is available, perform DC prediction.
+ if (numIntraNeighbor == 0)
+ {
+ // Fill border with DC value
+ for (uint32_t i = 0; i < refSize; i++)
+ dst[i] = dcValue;
+
+ for (uint32_t i = 0; i < refSize - 1; i++)
+ dst[i + refSize] = dcValue;
+ }
+ else if (numIntraNeighbor == totalUnits)
+ {
+ // Fill top border with rec. samples
+ const pixel* adiTemp = adiOrigin - picStride - 1; // one row up and one column left of the block origin
+ memcpy(dst, adiTemp, refSize * sizeof(pixel));
+
+ // Fill left border with rec. samples
+ adiTemp = adiOrigin - 1;
+ for (uint32_t i = 0; i < refSize - 1; i++)
+ {
+ dst[i + refSize] = adiTemp[0];
+ adiTemp += picStride; // step down one picture row
+ }
+ }
+ else // reference samples are partially available
+ {
+ const bool *bNeighborFlags = intraNeighbors.bNeighborFlags;
+ const bool *pNeighborFlags;
+ int aboveUnits = intraNeighbors.aboveUnits;
+ int leftUnits = intraNeighbors.leftUnits;
+ int unitWidth = intraNeighbors.unitWidth;
+ int unitHeight = intraNeighbors.unitHeight;
+ int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
+ pixel adiLineBuffer[5 * MAX_CU_SIZE]; // linear scratch: left samples (bottom-most first), then the top-left unit, then the above samples
+ pixel *adi;
+
+ // Initialize
+ for (int i = 0; i < totalSamples; i++)
+ adiLineBuffer[i] = dcValue;
+
+ // Fill top-left sample
+ const pixel* adiTemp = adiOrigin - picStride - 1;
+ adi = adiLineBuffer + (leftUnits * unitHeight);
+ pNeighborFlags = bNeighborFlags + leftUnits;
+ if (*pNeighborFlags)
+ {
+ pixel topLeftVal = adiTemp[0];
+ for (int i = 0; i < unitWidth; i++)
+ adi[i] = topLeftVal; // the single corner sample is replicated across a whole unit
+ }
+
+ // Fill left & below-left samples
+ adiTemp += picStride;
+ adi--;
+ pNeighborFlags--;
+ for (int j = 0; j < leftUnits; j++)
+ {
+ if (*pNeighborFlags)
+ for (int i = 0; i < unitHeight; i++)
+ adi[-i] = adiTemp[i * picStride]; // picture rows go downward while the scratch index goes backward
+
+ adiTemp += unitHeight * picStride;
+ adi -= unitHeight;
+ pNeighborFlags--;
+ }
+
+ // Fill above & above-right samples
+ adiTemp = adiOrigin - picStride;
+ adi = adiLineBuffer + (leftUnits * unitHeight) + unitWidth;
+ pNeighborFlags = bNeighborFlags + leftUnits + 1;
+ for (int j = 0; j < aboveUnits; j++)
+ {
+ if (*pNeighborFlags)
+ memcpy(adi, adiTemp, unitWidth * sizeof(*adiTemp));
+ adiTemp += unitWidth;
+ adi += unitWidth;
+ pNeighborFlags++;
+ }
+
+ // Pad reference samples when necessary
+ int curr = 0;
+ int next = 1;
+ adi = adiLineBuffer;
+ int pAdiLineTopRowOffset = leftUnits * (unitHeight - unitWidth);
+ if (!bNeighborFlags[0])
+ {
+ // very bottom unit of bottom-left; at least one unit will be valid.
+ while (next < totalUnits && !bNeighborFlags[next])
+ next++;
+
+ pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
+ const pixel refSample = *pAdiLineNext; // first sample of the first available unit
+ // Pad unavailable samples with new value
+ int nextOrTop = X265_MIN(next, leftUnits);
+ // fill left column
+ while (curr < nextOrTop)
+ {
+ for (int i = 0; i < unitHeight; i++)
+ adi[i] = refSample;
+
+ adi += unitHeight;
+ curr++;
+ }
+
+ // fill top row
+ while (curr < next)
+ {
+ for (int i = 0; i < unitWidth; i++)
+ adi[i] = refSample;
+
+ adi += unitWidth;
+ curr++;
+ }
+ }
+
+ // pad all other reference samples.
+ while (curr < totalUnits)
+ {
+ if (!bNeighborFlags[curr]) // samples not available
+ {
+ int numSamplesInCurrUnit = (curr >= leftUnits) ? unitWidth : unitHeight;
+ const pixel refSample = *(adi - 1); // repeat the last sample of the preceding unit
+ for (int i = 0; i < numSamplesInCurrUnit; i++)
+ adi[i] = refSample;
+
+ adi += numSamplesInCurrUnit;
+ curr++;
+ }
+ else
+ {
+ adi += (curr >= leftUnits) ? unitWidth : unitHeight;
+ curr++;
+ }
+ }
+
+ // Copy processed samples
+ adi = adiLineBuffer + refSize + unitWidth - 2;
+ memcpy(dst, adi, refSize * sizeof(pixel));
+
+ adi = adiLineBuffer + refSize - 1;
+ for (int i = 0; i < (int)refSize - 1; i++)
+ dst[i + refSize] = adi[-(i + 1)]; // read the scratch backward so the left column ends up in forward order in dst
+ }
}
bool Predict::isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT)
diff -r b6146d08a1f1 -r 0b87552d51f9 source/common/predict.h
--- a/source/common/predict.h Tue Nov 25 13:04:58 2014 +0530
+++ b/source/common/predict.h Fri Nov 28 20:43:52 2014 +0530
@@ -40,8 +40,6 @@
{
public:
- enum { ADI_BUF_STRIDE = (2 * MAX_CU_SIZE + 1 + 15) }; // alignment to 16 bytes
-
/* Weighted prediction scaling values built from slice parameters (bitdepth scaled) */
struct WeightValues
{
@@ -64,12 +62,8 @@
ShortYuv m_predShortYuv[2]; /* temporary storage for weighted prediction */
int16_t* m_immedVals;
- /* Intra prediction buffers */
- pixel* m_predBuf;
- pixel* m_refAbove;
- pixel* m_refAboveFlt;
- pixel* m_refLeft;
- pixel* m_refLeftFlt;
+ // Unfiltered/filtered neighbours of the current partition.
+ pixel intraNeighbours[2][257];
/* Slice information */
const Slice* m_predSlice;
@@ -106,7 +100,7 @@
/* Intra prediction helper functions */
static void initIntraNeighbors(const CUData& cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
- static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
+ static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258]);
static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT);
static int isAboveAvailable(const CUData& cu, uint32_t partIdxLT, uint32_t partIdxRT, bool* bValidFlags);
@@ -123,14 +117,10 @@
/* Angular Intra */
void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
- void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
+ void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, int dirMode);
void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t partDepth, uint32_t chromaId);
- pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
- {
- return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1));
- }
};
}
diff -r b6146d08a1f1 -r 0b87552d51f9 source/common/primitives.h
--- a/source/common/primitives.h Tue Nov 25 13:04:58 2014 +0530
+++ b/source/common/primitives.h Fri Nov 28 20:43:52 2014 +0530
@@ -145,7 +145,9 @@
typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
+typedef void (*intra_pred_new_t)(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter);
typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
+typedef void (*intra_allangs_new_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
typedef void (*cpy16to16_shl_t)(int16_t* dst, const int16_t* src, intptr_t, int, int);
typedef void (*cvt16to32_shl_t)(int32_t* dst, const int16_t* src, intptr_t, int, int);
@@ -254,8 +256,11 @@
addAvg_t luma_addAvg[NUM_LUMA_PARTITIONS];
intra_pred_t intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE];
+ intra_pred_new_t intra_pred_new[NUM_INTRA_MODE][NUM_TR_SIZE];
intra_allangs_t intra_pred_allangs[NUM_TR_SIZE];
+ intra_allangs_new_t intra_pred_allangs_new[NUM_TR_SIZE];
scale_t scale1D_128to64;
+ scale_t scale1D_128to64_new;
scale_t scale2D_64to32;
dct_t dct[NUM_DCTS];
diff -r b6146d08a1f1 -r 0b87552d51f9 source/encoder/search.cpp
--- a/source/encoder/search.cpp Tue Nov 25 13:04:58 2014 +0530
+++ b/source/encoder/search.cpp Fri Nov 28 20:43:52 2014 +0530
@@ -793,7 +793,6 @@
return codeIntraChromaTSkip(mode, cuGeom, trDepth, trDepthC, absPartIdx, psyEnergy);
uint32_t qtLayer = log2TrSize - 2;
- uint32_t tuSize = 1 << log2TrSizeC;
uint32_t outDist = 0;
uint32_t curPartNum = NUM_CU_PARTITIONS >> ((cu.m_cuDepth[0] + trDepthC) << 1);
@@ -824,7 +823,6 @@
// init availability pattern
initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
@@ -833,7 +831,7 @@
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
// get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
@@ -877,7 +875,6 @@
uint32_t fullDepth = cu.m_cuDepth[0] + trDepth;
uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
uint32_t log2TrSizeC = 2;
- uint32_t tuSize = 4;
uint32_t qtLayer = log2TrSize - 2;
uint32_t outDist = 0;
@@ -914,7 +911,6 @@
// init availability pattern
initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
if (chromaPredMode == DM_CHROMA_IDX)
@@ -923,7 +919,7 @@
chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
// get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
uint64_t bCost = MAX_INT64;
uint32_t bDist = 0;
@@ -1084,7 +1080,6 @@
}
ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
- uint32_t tuSize = 1 << log2TrSizeC;
uint32_t stride = mode.fencYuv->m_csize;
const int sizeIdxC = log2TrSizeC - 2;
@@ -1114,9 +1109,8 @@
chromaPredMode = cu.m_lumaIntraDir[(m_csp == X265_CSP_I444) ? absPartIdxC : 0];
chromaPredMode = (m_csp == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[chromaPredMode] : chromaPredMode;
initAdiPatternChroma(cu, cuGeom, absPartIdxC, trDepthC, chromaId);
- pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
-
- predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+
+ predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
@@ -1221,10 +1215,6 @@
const pixel* fenc = intraMode.fencYuv->m_buf[0];
uint32_t stride = intraMode.fencYuv->m_size;
- pixel* above = m_refAbove + tuSize - 1;
- pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel* left = m_refLeft + tuSize - 1;
- pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
int sad, bsad;
uint32_t bits, bbits, mode, bmode;
uint64_t cost, bcost;
@@ -1244,26 +1234,23 @@
primitives.scale2D_64to32(bufScale, fenc, stride);
fenc = bufScale;
- // reserve space in case primitives need to store data in above
- // or left buffers
- pixel _above[4 * 32 + 1];
- pixel _left[4 * 32 + 1];
- pixel* aboveScale = _above + 2 * 32;
- pixel* leftScale = _left + 2 * 32;
- aboveScale[0] = leftScale[0] = above[0];
- primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
- primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+ pixel nScale[129];
+ intraNeighbours[1][0] = intraNeighbours[0][0];
+ primitives.scale1D_128to64_new(nScale + 1, intraNeighbours[0] + 1, 0);
+
+ // Copy the downscaled top/left neighbours from nScale into both intraNeighbours buffers
+ for (int x = 1; x < 65; x++)
+ {
+ intraNeighbours[0][x] = nScale[x]; // Top pixel
+ intraNeighbours[0][x + 64] = nScale[x + 64]; // Left pixel
+ intraNeighbours[1][x] = nScale[x]; // Top pixel
+ intraNeighbours[1][x + 64] = nScale[x + 64]; // Left pixel
+ }
scaleTuSize = 32;
scaleStride = 32;
costShift = 2;
- sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
- // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
- above = aboveScale;
- left = leftScale;
- aboveFiltered = aboveScale;
- leftFiltered = leftScale;
+ sizeIdx = 5 - 2;
}
pixelcmp_t sa8d = primitives.sa8d[sizeIdx];
@@ -1280,23 +1267,20 @@
uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
// DC
- primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+// primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ primitives.intra_pred_new[DC_IDX][sizeIdx](tmp, scaleStride, intraNeighbours[0], 0, (scaleTuSize <= 16));
bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
bmode = mode = DC_IDX;
bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
bcost = m_rdCost.calcRdSADCost(bsad, bbits);
- pixel* abovePlanar = above;
- pixel* leftPlanar = left;
-
+ // PLANAR
+ pixel* planar = intraNeighbours[0];
if (tuSize & (8 | 16 | 32))
- {
- abovePlanar = aboveFiltered;
- leftPlanar = leftFiltered;
- }
-
- // PLANAR
- primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ planar = intraNeighbours[1];
+
+// primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ primitives.intra_pred_new[PLANAR_IDX][sizeIdx](tmp, scaleStride, planar, 0, 0);
sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
mode = PLANAR_IDX;
bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
@@ -1306,7 +1290,8 @@
// Transpose NxN
primitives.transpose[sizeIdx](bufTrans, fenc, scaleStride);
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+// primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+ primitives.intra_pred_allangs_new[sizeIdx](tmp, intraNeighbours[0], intraNeighbours[1], (scaleTuSize <= 16));
bool modeHor;
const pixel* cmp;
@@ -1452,46 +1437,37 @@
const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
uint32_t stride = predYuv->m_size;
- pixel* above = m_refAbove + tuSize - 1;
- pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
- pixel* left = m_refLeft + tuSize - 1;
- pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
-
// 33 Angle modes once
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
- ALIGN_VAR_32(pixel, bufScale[32 * 32]);
- pixel _above[4 * 32 + 1];
- pixel _left[4 * 32 + 1];
int scaleTuSize = tuSize;
int scaleStride = stride;
int costShift = 0;
if (tuSize > 32)
{
- pixel* aboveScale = _above + 2 * 32;
- pixel* leftScale = _left + 2 * 32;
-
// origin is 64x64, we scale to 32x32 and setup required parameters
+ ALIGN_VAR_32(pixel, bufScale[32 * 32]);
primitives.scale2D_64to32(bufScale, fenc, stride);
fenc = bufScale;
- // reserve space in case primitives need to store data in above
- // or left buffers
- aboveScale[0] = leftScale[0] = above[0];
- primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
- primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+ pixel nScale[129];
+ intraNeighbours[1][0] = intraNeighbours[0][0];
+ primitives.scale1D_128to64_new(nScale + 1, intraNeighbours[0] + 1, 0);
+
+ // Copy the downscaled top/left neighbours from nScale into both intraNeighbours buffers
+ for (int x = 1; x < 65; x++)
+ {
+ intraNeighbours[0][x] = nScale[x]; // Top pixel
+ intraNeighbours[0][x + 64] = nScale[x + 64]; // Left pixel
+ intraNeighbours[1][x] = nScale[x]; // Top pixel
+ intraNeighbours[1][x + 64] = nScale[x + 64]; // Left pixel
+ }
scaleTuSize = 32;
scaleStride = 32;
costShift = 2;
- sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
- // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
- above = aboveScale;
- left = leftScale;
- aboveFiltered = aboveScale;
- leftFiltered = leftScale;
+ sizeIdx = 5 - 2;
}
m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
@@ -1509,27 +1485,27 @@
uint64_t bcost;
// DC
- primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+// primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+ primitives.intra_pred_new[DC_IDX][sizeIdx](tmp, scaleStride, intraNeighbours[0], 0, (scaleTuSize <= 16));
uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
// PLANAR
- pixel* abovePlanar = above;
- pixel* leftPlanar = left;
+ pixel* planar = intraNeighbours[0];
if (tuSize >= 8 && tuSize <= 32)
- {
- abovePlanar = aboveFiltered;
- leftPlanar = leftFiltered;
- }
- primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ planar = intraNeighbours[1];
+
+// primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+ primitives.intra_pred_new[PLANAR_IDX][sizeIdx](tmp, scaleStride, planar, 0, 0);
bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
// angular predictions
- primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+// primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+ primitives.intra_pred_allangs_new[sizeIdx](tmp, intraNeighbours[0], intraNeighbours[1], (scaleTuSize <= 16));
primitives.transpose[sizeIdx](buf_trans, fenc, scaleStride);
for (int mode = 2; mode < 35; mode++)
@@ -1643,8 +1619,8 @@
log2TrSizeC = 5;
}
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
- Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
+// Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 1);
+// Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, 2);
cu.getAllowedChromaDir(0, modeList);
// check chroma modes
@@ -1661,10 +1637,11 @@
{
const pixel* fenc = fencYuv->m_buf[chromaId];
pixel* pred = predYuv->m_buf[chromaId];
- pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
+// pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
+ Predict::initAdiPatternChroma(cu, cuGeom, 0, 0, chromaId);
// get prediction signal
- predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
+ predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
cost += primitives.sa8d[log2TrSizeC - 2](fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
}
diff -r b6146d08a1f1 -r 0b87552d51f9 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Tue Nov 25 13:04:58 2014 +0530
+++ b/source/encoder/slicetype.cpp Fri Nov 28 20:43:52 2014 +0530
@@ -1626,39 +1626,37 @@
if (!fenc->bIntraCalculated)
{
const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
-
- pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
- pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
- pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE;
- pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE;
+ const int cuSize2 = cuSize << 1;
+ pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
// Copy Above
- memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
+ memcpy(neighbours[0], pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
// Copy Left
- for (int i = 0; i < cuSize + 1; i++)
- {
- left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
- }
+ for (int i = 1; i < cuSize + 1; i++)
+ neighbours[0][i + cuSize2] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
for (int i = 0; i < cuSize; i++)
{
- above0[cuSize + i + 1] = above0[cuSize];
- left0[cuSize + i + 1] = left0[cuSize];
+ // Copy above-last pixel
+ neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8]
+ // Copy left-last pixel
+ neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24]
}
- // filtering with [1 2 1]
- // assume getUseStrongIntraSmoothing() is disabled
- above1[0] = above0[0];
- above1[2 * cuSize] = above0[2 * cuSize];
- left1[0] = left0[0];
- left1[2 * cuSize] = left0[2 * cuSize];
- for (int i = 1; i < 2 * cuSize; i++)
+ // Filter neighbour pixels
+ neighbours[1][0] = neighbours[0][0]; // Copy top-left pixel
+ neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel
+ neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
+
+ neighbours[1][1] = (neighbours[0][0] + (neighbours[0][1] << 1) + neighbours[0][2] + 2) >> 2;
+ neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
+ for (int i = 2; i < cuSize2; i++)
{
- above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2;
- left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2;
+ neighbours[1][i] = (neighbours[0][i - 1] + (neighbours[0][i] << 1) + neighbours[0][i + 1] + 2) >> 2;
+ neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
}
int predsize = cuSize * cuSize;
@@ -1666,18 +1664,19 @@
// generate 35 intra predictions into m_predictions
pixelcmp_t satd = primitives.satd[partitionFromLog2Size(X265_LOWRES_CU_BITS)];
int icost = m_me.COST_MAX;
- primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
+
+ primitives.intra_pred_new[DC_IDX][sizeIdx](m_predictions, cuSize, neighbours[0], 0, (cuSize <= 16));
int cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
if (cost < icost)
icost = cost;
- pixel *above = (cuSize >= 8) ? above1 : above0;
- pixel *left = (cuSize >= 8) ? left1 : left0;
- primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
+
+ pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
+ primitives.intra_pred_new[PLANAR_IDX][sizeIdx](m_predictions, cuSize, planar, 0, 0);
cost = satd(m_me.fenc, FENC_STRIDE, m_predictions, cuSize);
if (cost < icost)
icost = cost;
- primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
+ primitives.intra_pred_allangs_new[sizeIdx](m_predictions + 2 * predsize, neighbours[0], neighbours[1], (cuSize <= 16));
// calculate satd costs, keep least cost
ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
primitives.transpose[sizeIdx](buf_trans, m_me.fenc, FENC_STRIDE);
More information about the x265-devel
mailing list