[x265] [PATCH 02 of 29] intra refactoring: intra prediction with modified planar, angular prediction functions

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:10 CET 2015


# HG changeset patch
# User Ashok Kumar Mishra<ashok at multicorewareinc.com>
# Date 1421075255 -19800
#      Mon Jan 12 20:37:35 2015 +0530
# Node ID cf29bf7824491d35e20df5249810ff9a1520d3e3
# Parent  9ec15ed0fe2a56eaf0c45954b750cac7293ffcd9
intra refactoring: intra prediction with modified planar, angular prediction functions
The buffer used for intra prediction is reduced from roughly 1152 * 1056 + 768 bytes to around 2 * (1 + 2 * 128) bytes.
The planar and angular prediction functions are modified. This patch uses the modified C functions for all intra prediction functions.

diff -r 9ec15ed0fe2a -r cf29bf782449 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/common/intrapred.cpp	Mon Jan 12 20:37:35 2015 +0530
@@ -76,6 +76,25 @@
         dcPredFilter(above + 1, left + 1, dst, dstStride, width);
 }
 
+template<int width>
+void intra_pred_dc_c_new(pixel* dst, intptr_t dstStride, pixel* srcPix, int /*dirMode*/, int bFilter)
+{
+    int k, l;
+
+    int dcVal = width;
+    for (int i = 0; i < width; i++)
+        dcVal += srcPix[1 + i] + srcPix[2 * width + 1 + i];
+
+    dcVal = dcVal / (width + width);
+    for (k = 0; k < width; k++)
+        for (l = 0; l < width; l++)
+            dst[k * dstStride + l] = (pixel)dcVal;
+
+    if (bFilter)
+        dcPredFilter(srcPix + 1, srcPix + (2 * width + 1), dst, dstStride, width);
+
+}
+
 template<int log2Size>
 void planar_pred_c(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int /*dirMode*/, int /*bFilter*/)
 {
@@ -122,7 +141,20 @@
         }
     }
 }
+template<int log2Size>
+void planar_pred_c_new(pixel* dst, intptr_t dstStride, pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
+{
+    const int blkSize = 1 << log2Size;
 
+    pixel* above = srcPix + 1;
+    pixel* left  = srcPix + (2 * blkSize + 1);
+
+    pixel topRight = above[blkSize];
+    pixel bottomLeft = left[blkSize];
+    for (int y = 0; y < blkSize; y++)
+        for (int x = 0; x < blkSize; x++)
+            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
+}
 template<int width>
 void intra_pred_ang_c(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
 {
@@ -228,6 +260,107 @@
     }
 }
 
+template<int width>
+void intra_pred_ang_c_new(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+{
+    int width2 = width << 1;
+    // Flip the neighbours in the horizontal case.
+    int horMode = dirMode < 18;
+    pixel neighbourBuf[129];
+
+    if (horMode)
+    {
+        neighbourBuf[0] = srcPix[0];
+        for (int i = 0; i < width << 1; i++)
+        {
+            neighbourBuf[1 + i] = srcPix[width2 + 1 + i];
+            neighbourBuf[width2 + 1 + i] = srcPix[1 + i];
+        }
+        srcPix = neighbourBuf;
+    }
+
+    // Intra prediction angle and inverse angle tables.
+    const int8_t angleTable[17] = { -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 };
+    const int16_t invAngleTable[8] = { 4096, 1638, 910, 630, 482, 390, 315, 256 };
+
+    // Get the prediction angle.
+    int angleOffset = horMode ? 10 - dirMode : dirMode - 26;
+    int angle = angleTable[8 + angleOffset];
+
+    // Vertical Prediction.
+    if (!angle)
+    {
+        for (int y = 0; y < width; y++)
+            for (int x = 0; x < width; x++)
+                dst[y * dstStride + x] = srcPix[1 + x];
+
+        if (bFilter)
+        {
+            int topLeft = srcPix[0], top = srcPix[1];
+            for (int y = 0; y < width; y++)
+                dst[y * dstStride] = x265_clip((int16_t)(top + ((srcPix[width2 + 1 + y] - topLeft) >> 1)));
+        }
+    }
+    else // Angular prediction.
+    {
+        // Get the reference pixels. The reference base is the first pixel to the top (neighbourBuf[1]).
+        pixel refBuf[64], *ref;
+
+        // Use the projected left neighbours and the top neighbours.
+        if (angle < 0)
+        {
+            // Number of neighbours projected. 
+            int nbProjected = -((width * angle) >> 5) - 1;
+            ref = refBuf + nbProjected + 1;
+
+            // Project the neighbours.
+            int invAngle = invAngleTable[- angleOffset - 1];
+            int invAngleSum = 128;
+            for (int i = 0; i < nbProjected; i++)
+            {
+                invAngleSum += invAngle;
+                ref[- 2 - i] = srcPix[width2 + (invAngleSum >> 8)];
+            }
+
+            // Copy the top-left and top pixels.
+            for (int i = 0; i < width + 1; i++)
+                ref[-1 + i] = srcPix[i];
+        }
+        else // Use the top and top-right neighbours.
+            ref = srcPix + 1;
+
+        // Pass every row.
+        int angleSum = 0;
+        for (int y = 0; y < width; y++)
+        {
+            angleSum += angle;
+            int offset = angleSum >> 5;
+            int fraction = angleSum & 31;
+
+            if (fraction) // Interpolate
+                for (int x = 0; x < width; x++)
+                    dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x] + fraction * ref[offset + x + 1] + 16) >> 5);
+            else // Copy.
+                for (int x = 0; x < width; x++)
+                    dst[y * dstStride + x] = ref[offset + x];
+        }
+    }
+
+    // Flip for horizontal.
+    if (horMode)
+    {
+        for (int y = 0; y < width - 1; y++)
+        {
+            for (int x = y + 1; x < width; x++)
+            {
+                pixel tmp              = dst[y * dstStride + x];
+                dst[y * dstStride + x] = dst[x * dstStride + y];
+                dst[x * dstStride + y] = tmp;
+            }
+        }
+    }
+}
+
 template<int log2Size>
 void all_angs_pred_c(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, int bLuma)
 {
@@ -258,6 +391,36 @@
         }
     }
 }
+
+template<int log2Size>
+void all_angs_pred_c_new(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+{
+    const int size = 1 << log2Size;
+    for (int mode = 2; mode <= 34; mode++)
+    {
+        pixel *srcPix  = (g_intraFilterFlags[mode] & size ? filtPix  : refPix);
+        pixel *out = dest + ((mode - 2) << (log2Size * 2));
+
+        intra_pred_ang_c_new<size>(out, size, srcPix, mode, bLuma);
+
+        // Optimize code don't flip buffer
+        bool modeHor = (mode < 18);
+
+        // transpose the block if this is a horizontal mode
+        if (modeHor)
+        {
+            for (int k = 0; k < size - 1; k++)
+            {
+                for (int l = k + 1; l < size; l++)
+                {
+                    pixel tmp         = out[k * size + l];
+                    out[k * size + l] = out[l * size + k];
+                    out[l * size + k] = tmp;
+                }
+            }
+        }
+    }
+}
 }
 
 namespace x265 {
@@ -270,22 +433,43 @@
     p.intra_pred[0][BLOCK_16x16] = planar_pred_c<4>;
     p.intra_pred[0][BLOCK_32x32] = planar_pred_c<5>;
 
+    p.intra_pred_new[0][BLOCK_4x4] = planar_pred_c_new<2>;
+    p.intra_pred_new[0][BLOCK_8x8] = planar_pred_c_new<3>;
+    p.intra_pred_new[0][BLOCK_16x16] = planar_pred_c_new<4>;
+    p.intra_pred_new[0][BLOCK_32x32] = planar_pred_c_new<5>;
+
     // Intra Prediction DC
     p.intra_pred[1][BLOCK_4x4] = intra_pred_dc_c<4>;
     p.intra_pred[1][BLOCK_8x8] = intra_pred_dc_c<8>;
     p.intra_pred[1][BLOCK_16x16] = intra_pred_dc_c<16>;
     p.intra_pred[1][BLOCK_32x32] = intra_pred_dc_c<32>;
+
+    p.intra_pred_new[1][BLOCK_4x4] = intra_pred_dc_c_new<4>;
+    p.intra_pred_new[1][BLOCK_8x8] = intra_pred_dc_c_new<8>;
+    p.intra_pred_new[1][BLOCK_16x16] = intra_pred_dc_c_new<16>;
+    p.intra_pred_new[1][BLOCK_32x32] = intra_pred_dc_c_new<32>;
+
     for (int i = 2; i < NUM_INTRA_MODE; i++)
     {
         p.intra_pred[i][BLOCK_4x4] = intra_pred_ang_c<4>;
         p.intra_pred[i][BLOCK_8x8] = intra_pred_ang_c<8>;
         p.intra_pred[i][BLOCK_16x16] = intra_pred_ang_c<16>;
         p.intra_pred[i][BLOCK_32x32] = intra_pred_ang_c<32>;
+
+        p.intra_pred_new[i][BLOCK_4x4] = intra_pred_ang_c_new<4>;
+        p.intra_pred_new[i][BLOCK_8x8] = intra_pred_ang_c_new<8>;
+        p.intra_pred_new[i][BLOCK_16x16] = intra_pred_ang_c_new<16>;
+        p.intra_pred_new[i][BLOCK_32x32] = intra_pred_ang_c_new<32>;
     }
 
     p.intra_pred_allangs[BLOCK_4x4] = all_angs_pred_c<2>;
     p.intra_pred_allangs[BLOCK_8x8] = all_angs_pred_c<3>;
     p.intra_pred_allangs[BLOCK_16x16] = all_angs_pred_c<4>;
     p.intra_pred_allangs[BLOCK_32x32] = all_angs_pred_c<5>;
+
+    p.intra_pred_allangs_new[BLOCK_4x4] = all_angs_pred_c_new<2>;
+    p.intra_pred_allangs_new[BLOCK_8x8] = all_angs_pred_c_new<3>;
+    p.intra_pred_allangs_new[BLOCK_16x16] = all_angs_pred_c_new<4>;
+    p.intra_pred_allangs_new[BLOCK_32x32] = all_angs_pred_c_new<5>;
 }
 }
diff -r 9ec15ed0fe2a -r cf29bf782449 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/common/pixel.cpp	Mon Jan 12 20:37:35 2015 +0530
@@ -643,6 +643,32 @@
     }
 }
 
+void scale1D_128to64_new(pixel *dst, const pixel *src, intptr_t /*stride*/)
+{
+    int x;
+    const pixel* src1 = src;
+    const pixel* src2 = src + 128;
+
+    pixel* dst1 = dst;
+    pixel* dst2 = dst + 64/*128*/;
+
+    for (x = 0; x < 128; x += 2)
+    {
+        // Top pixel
+        pixel pix0 = src1[(x + 0)];
+        pixel pix1 = src1[(x + 1)];
+
+        // Left pixel
+        pixel pix2 = src2[(x + 0)];
+        pixel pix3 = src2[(x + 1)];
+        int sum1 = pix0 + pix1;
+        int sum2 = pix2 + pix3;
+
+        dst1[x >> 1] = (pixel)((sum1 + 1) >> 1);
+        dst2[x >> 1] = (pixel)((sum2 + 1) >> 1);
+    }
+}
+
 void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
 {
     uint32_t x, y;
@@ -1366,6 +1392,7 @@
     p.weight_sp = weight_sp_c;
 
     p.scale1D_128to64 = scale1D_128to64;
+    p.scale1D_128to64_new = scale1D_128to64_new;
     p.scale2D_64to32 = scale2D_64to32;
     p.frameInitLowres = frame_init_lowres_core;
     p.ssim_4x4x2_core = ssim_4x4x2_core;
diff -r 9ec15ed0fe2a -r cf29bf782449 source/common/predict.cpp
--- a/source/common/predict.cpp	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/common/predict.cpp	Mon Jan 12 20:37:35 2015 +0530
@@ -40,18 +40,11 @@
 
 Predict::Predict()
 {
-    m_predBuf = NULL;
-    m_refAbove = NULL;
-    m_refAboveFlt = NULL;
-    m_refLeft = NULL;
-    m_refLeftFlt = NULL;
     m_immedVals = NULL;
 }
 
 Predict::~Predict()
 {
-    X265_FREE(m_predBuf);
-    X265_FREE(m_refAbove);
     X265_FREE(m_immedVals);
     m_predShortYuv[0].destroy();
     m_predShortYuv[1].destroy();
@@ -62,16 +55,7 @@
     m_csp = csp;
     m_hChromaShift = CHROMA_H_SHIFT(csp);
     m_vChromaShift = CHROMA_V_SHIFT(csp);
-
-    int predBufHeight = ((MAX_CU_SIZE + 2) << 4);
-    int predBufStride = ((MAX_CU_SIZE + 8) << 4);
-    CHECKED_MALLOC(m_predBuf, pixel, predBufStride * predBufHeight);
     CHECKED_MALLOC(m_immedVals, int16_t, 64 * (64 + NTAPS_LUMA - 1));
-    CHECKED_MALLOC(m_refAbove, pixel, 12 * MAX_CU_SIZE);
-
-    m_refAboveFlt = m_refAbove + 3 * MAX_CU_SIZE;
-    m_refLeft = m_refAboveFlt + 3 * MAX_CU_SIZE;
-    m_refLeftFlt = m_refLeft + 3 * MAX_CU_SIZE;
 
     return m_predShortYuv[0].create(MAX_CU_SIZE, csp) && m_predShortYuv[1].create(MAX_CU_SIZE, csp);
 
@@ -82,68 +66,48 @@
 void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize)
 {
     int tuSize = 1 << log2TrSize;
-
-    pixel *refLft, *refAbv;
-
-    if (!(g_intraFilterFlags[dirMode] & tuSize))
-    {
-        refLft = m_refLeft + tuSize - 1;
-        refAbv = m_refAbove + tuSize - 1;
-    }
-    else
-    {
-        refLft = m_refLeftFlt + tuSize - 1;
-        refAbv = m_refAboveFlt + tuSize - 1;
-    }
+    pixel* srcPix = (!(g_intraFilterFlags[dirMode] & tuSize)) ? intraNeighbourBuf[0] : intraNeighbourBuf[1];
 
     bool bFilter = log2TrSize <= 4;
     int sizeIdx = log2TrSize - 2;
     X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
-    primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter);
+//    primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft, refAbv, dirMode, bFilter);
+    primitives.intra_pred_new[dirMode][sizeIdx](dst, stride, srcPix, dirMode, bFilter);
 }
 
-void Predict::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
+void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
 {
     int tuSize = 1 << log2TrSizeC;
     int tuSize2 = tuSize << 1;
 
-    // Create the prediction
-    const int bufOffset = tuSize - 1;
-    pixel buf0[3 * MAX_CU_SIZE];
-    pixel buf1[3 * MAX_CU_SIZE];
-    pixel* above;
-    pixel* left = buf0 + bufOffset;
-
-    int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
-
-    left[0] = src[0];
-    for (int k = 1; k < limit; k++)
-        left[k] = src[k + tuSize2];
+    pixel* srcBuf = intraNeighbourBuf[0];
 
     if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
     {
-        // generate filtered intra prediction samples
-        buf0[bufOffset - 1] = src[1];
-        left = buf1 + bufOffset;
-        for (int i = 0; i < tuSize2; i++)
-            left[i] = (buf0[bufOffset + i - 1] + 2 * buf0[bufOffset + i] + buf0[bufOffset + i + 1] + 2) >> 2;
-        left[tuSize2] = buf0[bufOffset + tuSize2];
+        pixel* fltBuf = intraNeighbourBuf[1];
+        pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
 
-        above = buf0 + bufOffset;
-        above[0] = left[0];
+        // filtering top
         for (int i = 1; i < tuSize2; i++)
-            above[i] = (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2;
-        above[tuSize2] = src[tuSize2];
-    }
-    else
-    {
-        above = buf1 + bufOffset;
-        memcpy(above, src, (tuSize2 + 1) * sizeof(pixel));
+            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+        fltBuf[tuSize2] = topLast;
+
+        // filtering top-left
+        fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
+
+        //filtering left
+        fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
+        for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
+        fltBuf[tuSize2 + tuSize2] = leftLast;
+
+        srcBuf = intraNeighbourBuf[1];
     }
 
     int sizeIdx = log2TrSizeC - 2;
     X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
-    primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
+//    primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
+    primitives.intra_pred_new[dirMode][sizeIdx](dst, stride, srcBuf, dirMode, 0);
 }
 
 void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx)
@@ -651,37 +615,22 @@
 
 void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
-    pixel* adiBuf      = m_predBuf;
-    pixel* refAbove    = m_refAbove;
-    pixel* refLeft     = m_refLeft;
-    pixel* refAboveFlt = m_refAboveFlt;
-    pixel* refLeftFlt  = m_refLeftFlt;
-
     int tuSize = intraNeighbors.tuSize;
     int tuSize2 = tuSize << 1;
 
     pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
 
-    fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
+    fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
-    // initialization of ADI buffers
-    const int bufOffset = tuSize - 1;
-    refAbove += bufOffset;
-    refLeft += bufOffset;
+    pixel* refBuf = intraNeighbourBuf[0];
+    pixel* fltBuf = intraNeighbourBuf[1];
 
-    memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
-
-    refLeft[0] = adiBuf[0];
-    for (int k = 1; k < tuSize2 + 1 ; k++)
-        refLeft[k] = adiBuf[k + tuSize2];
+    pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
 
     if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
     {
         // generate filtered intra prediction samples
-        refAboveFlt += bufOffset;
-        refLeftFlt += bufOffset;
-
         bool bStrongSmoothing = (tuSize == 32 && cu.m_slice->m_sps->bUseStrongIntraSmoothing);
 
         if (bStrongSmoothing)
@@ -689,56 +638,57 @@
             const int trSize = 32;
             const int trSize2 = trSize << 1;
             const int threshold = 1 << (X265_DEPTH - 5);
-            int refBL = refLeft[trSize2];
-            int refTL = refAbove[0];
-            int refTR = refAbove[trSize2];
-            bStrongSmoothing = (abs(refBL + refTL - (refLeft[trSize] << 1)) < threshold &&
-                abs(refTL + refTR - (refAbove[trSize] << 1)) < threshold);
+
+            pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
+
+            bStrongSmoothing = abs (topLeft + topLast - (topMiddle << 1)) < threshold &&
+                               abs (topLeft + leftLast - (leftMiddle << 1)) < threshold;
 
             if (bStrongSmoothing)
             {
                 // bilinear interpolation
-                const int shift = 5 + 1; // log2TrSize + 1;
-                int init = (refTL << shift) + tuSize;
+                const int shift = 5 + 1;
+                int init = (topLeft << shift) + tuSize;
                 int deltaL, deltaR;
 
-                refLeftFlt[0] = refAboveFlt[0] = refAbove[0];
+                //TODO: Performance Primitive???
+                deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
 
-                //TODO: Performance Primitive???
-                deltaL = refBL - refTL; deltaR = refTR - refTL;
+                fltBuf[0] = topLeft;
                 for (int i = 1; i < trSize2; i++)
                 {
-                    refLeftFlt[i] = (pixel)((init + deltaL * i) >> shift);
-                    refAboveFlt[i] = (pixel)((init + deltaR * i) >> shift);
+                    fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); //Left Filtering
+                    fltBuf[i] = (pixel)((init + deltaR * i) >> shift); //Above Filtering
                 }
-                refLeftFlt[trSize2] = refLeft[trSize2];
-                refAboveFlt[trSize2] = refAbove[trSize2];
+                fltBuf[trSize2] = topLast;
+                fltBuf[tuSize2 + trSize2] = leftLast;
 
                 return;
             }
         }
 
-        refLeftFlt[0] = (refAbove[1] + (refLeft[0] << 1) + refLeft[1] + 2) >> 2;
+        // filtering top
         for (int i = 1; i < tuSize2; i++)
-            refLeftFlt[i] = (refLeft[i - 1] + (refLeft[i] << 1) + refLeft[i + 1] + 2) >> 2;
-        refLeftFlt[tuSize2] = refLeft[tuSize2];
+            fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+        fltBuf[tuSize2] = topLast;
 
-        refAboveFlt[0] = refLeftFlt[0];
-        for (int i = 1; i < tuSize2; i++)
-            refAboveFlt[i] = (refAbove[i - 1] + (refAbove[i] << 1) + refAbove[i + 1] + 2) >> 2;
-        refAboveFlt[tuSize2] = refAbove[tuSize2];
+        // filtering top-left
+        fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2;
+
+        //filtering left
+        fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2;
+        for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+            fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
+        fltBuf[tuSize2 + tuSize2] = leftLast;
     }
 }
 
 void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    uint32_t tuSize = intraNeighbors.tuSize;
-
     const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
-    pixel* adiRef = getAdiChromaBuf(chromaId, tuSize);
 
-    fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
+    fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 }
 
 void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
@@ -797,7 +747,7 @@
     intraNeighbors->tuSize = tuSize;
 }
 
-void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
+void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258])
 {
     const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
     int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
@@ -808,39 +758,39 @@
     // Nothing is available, perform DC prediction.
     if (numIntraNeighbor == 0)
     {
-        // Fill border with DC value
+        // Fill top border with DC value
         for (uint32_t i = 0; i < refSize; i++)
-            adiRef[i] = dcValue;
+            dst[i] = dcValue;
 
+        // Fill left border with DC value
         for (uint32_t i = 0; i < refSize - 1; i++)
-            adiRef[i + refSize] = dcValue;
+            dst[i + refSize] = dcValue;
     }
     else if (numIntraNeighbor == totalUnits)
     {
         // Fill top border with rec. samples
         const pixel* adiTemp = adiOrigin - picStride - 1;
-        memcpy(adiRef, adiTemp, refSize * sizeof(*adiRef));
+        memcpy(dst, adiTemp, refSize * sizeof(pixel));
 
         // Fill left border with rec. samples
         adiTemp = adiOrigin - 1;
-
         for (uint32_t i = 0; i < refSize - 1; i++)
         {
-            adiRef[i + refSize] = adiTemp[0];
+            dst[i + refSize] = adiTemp[0];
             adiTemp += picStride;
         }
     }
     else // reference samples are partially available
     {
-        const bool* bNeighborFlags = intraNeighbors.bNeighborFlags;
-        const bool* pNeighborFlags;
+        const bool *bNeighborFlags = intraNeighbors.bNeighborFlags;
+        const bool *pNeighborFlags;
         int aboveUnits = intraNeighbors.aboveUnits;
         int leftUnits = intraNeighbors.leftUnits;
         int unitWidth = intraNeighbors.unitWidth;
         int unitHeight = intraNeighbors.unitHeight;
         int totalSamples = (leftUnits * unitHeight) + ((aboveUnits + 1) * unitWidth);
         pixel adiLineBuffer[5 * MAX_CU_SIZE];
-        pixel* adi;
+        pixel *adi;
 
         // Initialize
         for (int i = 0; i < totalSamples; i++)
@@ -943,11 +893,11 @@
 
         // Copy processed samples
         adi = adiLineBuffer + refSize + unitWidth - 2;
-        memcpy(adiRef, adi, refSize * sizeof(*adiRef));
+        memcpy(dst, adi, refSize * sizeof(pixel));
 
         adi = adiLineBuffer + refSize - 1;
         for (int i = 0; i < (int)refSize - 1; i++)
-            adiRef[i + refSize] = adi[-(i + 1)];
+            dst[i + refSize] = adi[-(i + 1)];
     }
 }
 
diff -r 9ec15ed0fe2a -r cf29bf782449 source/common/predict.h
--- a/source/common/predict.h	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/common/predict.h	Mon Jan 12 20:37:35 2015 +0530
@@ -63,13 +63,8 @@
     ShortYuv  m_predShortYuv[2]; /* temporary storage for weighted prediction */
     int16_t*  m_immedVals;
 
-    /* Intra prediction buffers */
-    pixel*    m_predBuf;
-    pixel*    m_refAbove;
-    pixel*    m_refAboveFlt;
-    pixel*    m_refLeft;
-    pixel*    m_refLeftFlt;
-
+    // Unfiltered/filtered neighbours of the current partition.
+    pixel     intraNeighbourBuf[2][258];
     /* Slice information */
     const Slice* m_predSlice;
     int       m_csp;
@@ -105,8 +100,7 @@
 
     /* Intra prediction helper functions */
     static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
-    static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
-
+    static void fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258]);
     template<bool cip>
     static bool isAboveLeftAvailable(const CUData& cu, uint32_t partIdxLT);
     template<bool cip>
@@ -127,14 +121,9 @@
 
     /* Angular Intra */
     void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
-    void predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
-
+    void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
     void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
     void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
-    pixel* getAdiChromaBuf(uint32_t chromaId, int tuSize)
-    {
-        return m_predBuf + (chromaId == 1 ? 0 : 2 * ADI_BUF_STRIDE * (tuSize * 2 + 1));
-    }
 };
 }
 
diff -r 9ec15ed0fe2a -r cf29bf782449 source/common/primitives.h
--- a/source/common/primitives.h	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/common/primitives.h	Mon Jan 12 20:37:35 2015 +0530
@@ -121,7 +121,9 @@
 typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);
 
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, pixel* refLeft, pixel* refAbove, int dirMode, int bFilter);
+typedef void (*intra_pred_new_t)(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter);
 typedef void (*intra_allangs_t)(pixel* dst, pixel* above0, pixel* left0, pixel* above1, pixel* left1, int bLuma);
+typedef void (*intra_allangs_new_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
 
 typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -250,8 +252,11 @@
     denoiseDct_t          denoiseDct;
 
     intra_pred_t          intra_pred[NUM_INTRA_MODE][NUM_TR_SIZE]; /* todo: move to CU */
+    intra_pred_new_t      intra_pred_new[NUM_INTRA_MODE][NUM_TR_SIZE];
     intra_allangs_t       intra_pred_allangs[NUM_TR_SIZE];         /* todo: move to CU */
+    intra_allangs_new_t   intra_pred_allangs_new[NUM_TR_SIZE];
     scale_t               scale1D_128to64;
+    scale_t               scale1D_128to64_new;
     scale_t               scale2D_64to32;
 
     ssim_4x4x2_core_t     ssim_4x4x2_core;
diff -r 9ec15ed0fe2a -r cf29bf782449 source/encoder/search.cpp
--- a/source/encoder/search.cpp	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/encoder/search.cpp	Mon Jan 12 20:37:35 2015 +0530
@@ -777,7 +777,6 @@
 
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
     uint32_t qtLayer = log2TrSize - 2;
-    uint32_t tuSize = 1 << log2TrSizeC;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
     uint32_t outDist = 0;
@@ -815,11 +814,9 @@
 
             // init availability pattern
             initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
-            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             // get prediction signal
-            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
-
+            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
             cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep);
 
             primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride);
@@ -863,7 +860,6 @@
     uint32_t fullDepth = cu.m_cuDepth[0] + tuDepth;
     uint32_t log2TrSize = g_maxLog2CUSize - fullDepth;
     const uint32_t log2TrSizeC = 2;
-    uint32_t tuSize = 4;
     uint32_t qtLayer = log2TrSize - 2;
     uint32_t outDist = 0;
 
@@ -903,7 +899,6 @@
 
             // init availability pattern
             initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
-            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC];
             if (chromaPredMode == DM_CHROMA_IDX)
@@ -912,7 +907,7 @@
                 chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode];
 
             // get prediction signal
-            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
 
             uint64_t bCost = MAX_INT64;
             uint32_t bDist = 0;
@@ -1076,7 +1071,6 @@
     }
 
     ShortYuv& resiYuv = m_rqt[cuGeom.depth].tmpResiYuv;
-    uint32_t tuSize = 1 << log2TrSizeC;
     uint32_t stride = mode.fencYuv->m_csize;
     const uint32_t sizeIdxC = log2TrSizeC - 2;
 
@@ -1111,10 +1105,9 @@
 
             // init availability pattern
             initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId);
-            pixel* chromaPred = getAdiChromaBuf(chromaId, tuSize);
 
             // get prediction signal
-            predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, log2TrSizeC, m_csp);
+            predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp);
 
             X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n");
 
@@ -1208,10 +1201,6 @@
     const pixel* fenc = intraMode.fencYuv->m_buf[0];
     uint32_t stride = intraMode.fencYuv->m_size;
 
-    pixel* above = m_refAbove + tuSize - 1;
-    pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
-    pixel* left = m_refLeft + tuSize - 1;
-    pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
     int sad, bsad;
     uint32_t bits, bbits, mode, bmode;
     uint64_t cost, bcost;
@@ -1231,26 +1220,23 @@
         primitives.scale2D_64to32(bufScale, fenc, stride);
         fenc = bufScale;
 
-        // reserve space in case primitives need to store data in above
-        // or left buffers
-        pixel _above[4 * 32 + 1];
-        pixel _left[4 * 32 + 1];
-        pixel* aboveScale = _above + 2 * 32;
-        pixel* leftScale = _left + 2 * 32;
-        aboveScale[0] = leftScale[0] = above[0];
-        primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
-        primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+        pixel nScale[129];
+        intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
+        primitives.scale1D_128to64_new(nScale + 1, intraNeighbourBuf[0] + 1, 0);
+
+        //TO DO: primitive
+        for (int x = 1; x < 65; x++)
+        {
+            intraNeighbourBuf[0][x] = nScale[x];           // Top pixel
+            intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel
+            intraNeighbourBuf[1][x] = nScale[x];           // Top pixel
+            intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel
+        }
 
         scaleTuSize = 32;
         scaleStride = 32;
         costShift = 2;
         sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
-        // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
-        above = aboveScale;
-        left = leftScale;
-        aboveFiltered = aboveScale;
-        leftFiltered = leftScale;
     }
 
     pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
@@ -1267,23 +1253,20 @@
     uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms);
 
     // DC
-    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+//    primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+    primitives.intra_pred_new[DC_IDX][sizeIdx](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
     bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
     bmode = mode = DC_IDX;
     bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
     bcost = m_rdCost.calcRdSADCost(bsad, bbits);
 
-    pixel* abovePlanar = above;
-    pixel* leftPlanar = left;
-
+    // PLANAR
+    pixel* planar = intraNeighbourBuf[0];
     if (tuSize & (8 | 16 | 32))
-    {
-        abovePlanar = aboveFiltered;
-        leftPlanar = leftFiltered;
-    }
-
-    // PLANAR
-    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+        planar = intraNeighbourBuf[1];
+
+//    primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+    primitives.intra_pred_new[PLANAR_IDX][sizeIdx](tmp, scaleStride, planar, 0, 0);
     sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
     mode = PLANAR_IDX;
     bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
@@ -1294,7 +1277,8 @@
     if (primitives.intra_pred_allangs[sizeIdx])
     {
         primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride);
-        primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+//        primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+        primitives.intra_pred_allangs_new[sizeIdx](tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); 
     }
     else
         allangs = false;
@@ -1309,9 +1293,9 @@
         cost = m_rdCost.calcRdSADCost(sad, bits); \
     } else { \
         if (g_intraFilterFlags[angle] & scaleTuSize) \
-            primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, angle, scaleTuSize <= 16); \
+            primitives.intra_pred_new[angle][sizeIdx](tmp, scaleTuSize, intraNeighbourBuf[1], angle, scaleTuSize <= 16); \
         else \
-            primitives.intra_pred[angle][sizeIdx](tmp, scaleTuSize, left, above, angle, scaleTuSize <= 16); \
+            primitives.intra_pred_new[angle][sizeIdx](tmp, scaleTuSize, intraNeighbourBuf[0], angle, scaleTuSize <= 16); \
         sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; \
         bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \
         cost = m_rdCost.calcRdSADCost(sad, bits); \
@@ -1453,46 +1437,38 @@
             const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
             uint32_t stride = predYuv->m_size;
 
-            pixel* above = m_refAbove + tuSize - 1;
-            pixel* aboveFiltered = m_refAboveFlt + tuSize - 1;
-            pixel* left = m_refLeft + tuSize - 1;
-            pixel* leftFiltered = m_refLeftFlt + tuSize - 1;
-
             // 33 Angle modes once
             ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
             ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]);
-            ALIGN_VAR_32(pixel, bufScale[32 * 32]);
-            pixel _above[4 * 32 + 1];
-            pixel _left[4 * 32 + 1];
+
             int scaleTuSize = tuSize;
             int scaleStride = stride;
             int costShift = 0;
 
             if (tuSize > 32)
             {
-                pixel* aboveScale = _above + 2 * 32;
-                pixel* leftScale = _left + 2 * 32;
-
                 // origin is 64x64, we scale to 32x32 and setup required parameters
+                ALIGN_VAR_32(pixel, bufScale[32 * 32]);
                 primitives.scale2D_64to32(bufScale, fenc, stride);
                 fenc = bufScale;
 
-                // reserve space in case primitives need to store data in above
-                // or left buffers
-                aboveScale[0] = leftScale[0] = above[0];
-                primitives.scale1D_128to64(aboveScale + 1, above + 1, 0);
-                primitives.scale1D_128to64(leftScale + 1, left + 1, 0);
+                pixel nScale[129];
+                intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0];
+                primitives.scale1D_128to64_new(nScale + 1, intraNeighbourBuf[0] + 1, 0);
+
+                // TODO: move this neighbour-buffer copy loop into a primitive
+                for (int x = 1; x < 65; x++)
+                {
+                    intraNeighbourBuf[0][x] = nScale[x];           // Top pixel
+                    intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel
+                    intraNeighbourBuf[1][x] = nScale[x];           // Top pixel
+                    intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel
+                }
 
                 scaleTuSize = 32;
                 scaleStride = 32;
                 costShift = 2;
                 sizeIdx = 5 - 2; // log2(scaleTuSize) - 2
-
-                // Filtered and Unfiltered refAbove and refLeft pointing to above and left.
-                above = aboveScale;
-                left = leftScale;
-                aboveFiltered = aboveScale;
-                leftFiltered = leftScale;
             }
 
             m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
@@ -1510,29 +1486,29 @@
             uint64_t bcost;
 
             // DC
-            primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+//            primitives.intra_pred[DC_IDX][sizeIdx](tmp, scaleStride, left, above, 0, (scaleTuSize <= 16));
+            primitives.intra_pred_new[DC_IDX][sizeIdx](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
             uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits;
             uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
             modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
 
             // PLANAR
-            pixel* abovePlanar = above;
-            pixel* leftPlanar = left;
+            pixel* planar = intraNeighbourBuf[0];
             if (tuSize >= 8 && tuSize <= 32)
-            {
-                abovePlanar = aboveFiltered;
-                leftPlanar = leftFiltered;
-            }
-            primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+                planar = intraNeighbourBuf[1];
+
+//            primitives.intra_pred[PLANAR_IDX][sizeIdx](tmp, scaleStride, leftPlanar, abovePlanar, 0, 0);
+            primitives.intra_pred_new[PLANAR_IDX][sizeIdx](tmp, scaleStride, planar, 0, 0);
             bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits;
             sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift;
             modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
             COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
 
             // angular predictions
-            if (primitives.intra_pred_allangs[sizeIdx])
+            if (primitives.intra_pred_allangs_new[sizeIdx])
             {
-                primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+//                primitives.intra_pred_allangs[sizeIdx](tmp, above, left, aboveFiltered, leftFiltered, (scaleTuSize <= 16));
+                primitives.intra_pred_allangs_new[sizeIdx](tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
                 primitives.cu[sizeIdx].transpose(buf_trans, fenc, scaleStride);
                 for (int mode = 2; mode < 35; mode++)
                 {
@@ -1550,9 +1526,11 @@
                 for (int mode = 2; mode < 35; mode++)
                 {
                     if (g_intraFilterFlags[mode] & scaleTuSize)
-                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, mode, scaleTuSize <= 16);
+//                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, leftFiltered, aboveFiltered, mode, scaleTuSize <= 16);
+                        primitives.intra_pred_new[mode][sizeIdx](tmp, scaleTuSize, intraNeighbourBuf[1], mode, scaleTuSize <= 16);
                     else
-                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, left, above, mode, scaleTuSize <= 16);
+//                        primitives.intra_pred[mode][sizeIdx](tmp, scaleTuSize, left, above, mode, scaleTuSize <= 16);
+                        primitives.intra_pred_new[mode][sizeIdx](tmp, scaleTuSize, intraNeighbourBuf[0], mode, scaleTuSize <= 16);
                     bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits;
                     sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift;
                     modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
@@ -1663,8 +1641,6 @@
 
     IntraNeighbors intraNeighbors;
     initIntraNeighbors(cu, 0, tuDepth, false, &intraNeighbors);
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 1); // U
-    Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, 2); // V
     cu.getAllowedChromaDir(0, modeList);
 
     // check chroma modes
@@ -1681,10 +1657,9 @@
         {
             const pixel* fenc = fencYuv->m_buf[chromaId];
             pixel* pred = predYuv->m_buf[chromaId];
-            pixel* chromaPred = getAdiChromaBuf(chromaId, scaleTuSize);
-
+            Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId);
             // get prediction signal
-            predIntraChromaAng(chromaPred, chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
+            predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp);
             cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift;
         }
 
diff -r 9ec15ed0fe2a -r cf29bf782449 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Jan 12 14:49:22 2015 +0530
+++ b/source/encoder/slicetype.cpp	Mon Jan 12 20:37:35 2015 +0530
@@ -1668,37 +1668,37 @@
     if (!fenc->bIntraCalculated)
     {
         const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size
-
-        pixel _above0[X265_LOWRES_CU_SIZE * 4 + 1], *const above0 = _above0 + 2 * X265_LOWRES_CU_SIZE;
-        pixel _above1[X265_LOWRES_CU_SIZE * 4 + 1], *const above1 = _above1 + 2 * X265_LOWRES_CU_SIZE;
-        pixel _left0[X265_LOWRES_CU_SIZE * 4 + 1], *const left0 = _left0 + 2 * X265_LOWRES_CU_SIZE;
-        pixel _left1[X265_LOWRES_CU_SIZE * 4 + 1], *const left1 = _left1 + 2 * X265_LOWRES_CU_SIZE;
+        const int cuSize2 = cuSize << 1;
+        pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
 
         pixel *pix_cur = fenc->lowresPlane[0] + pelOffset;
 
         // Copy Above
-        memcpy(above0, pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
+        memcpy(neighbours[0], pix_cur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel));
 
         // Copy Left
-        for (int i = 0; i < cuSize + 1; i++)
-            left0[i] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
+        for (int i = 1; i < cuSize + 1; i++)
+            neighbours[0][i + cuSize2] = pix_cur[-1 - fenc->lumaStride + i * fenc->lumaStride];
 
         for (int i = 0; i < cuSize; i++)
         {
-            above0[cuSize + i + 1] = above0[cuSize];
-            left0[cuSize + i + 1] = left0[cuSize];
+            // Copy above-last pixel
+            neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8]
+            // Copy left-last pixel
+            neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24]
         }
 
-        // filtering with [1 2 1]
-        // assume getUseStrongIntraSmoothing() is disabled
-        above1[0] = above0[0];
-        above1[2 * cuSize] = above0[2 * cuSize];
-        left1[0] = left0[0];
-        left1[2 * cuSize] = left0[2 * cuSize];
-        for (int i = 1; i < 2 * cuSize; i++)
+        // Filter neighbour pixels with [1-2-1]
+        neighbours[1][0]  = neighbours[0][0];  // Copy top-left pixel 
+        neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel
+        neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
+
+        neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
+        neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
+        for (int i = 2; i < cuSize2; i++)
         {
-            above1[i] = (above0[i - 1] + 2 * above0[i] + above0[i + 1] + 2) >> 2;
-            left1[i] = (left0[i - 1] + 2 * left0[i] + left0[i + 1] + 2) >> 2;
+            neighbours[1][i]           = (neighbours[0][i - 1]      + (neighbours[0][i] << 1)      + neighbours[0][i + 1]      + 2) >> 2;
+            neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
         }
 
         int predsize = cuSize * cuSize;
@@ -1706,23 +1706,25 @@
         // generate 35 intra predictions into m_predictions
         pixelcmp_t satd = primitives.pu[partitionFromLog2Size(X265_LOWRES_CU_BITS)].satd;
         int icost = m_me.COST_MAX;
-        primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
+//        primitives.intra_pred[DC_IDX][sizeIdx](m_predictions, cuSize, left0, above0, 0, (cuSize <= 16));
+        primitives.intra_pred_new[DC_IDX][sizeIdx](m_predictions, cuSize, neighbours[0], 0, (cuSize <= 16));
         int cost = m_me.bufSATD(m_predictions, cuSize);
         if (cost < icost)
             icost = cost;
-        pixel *above = (cuSize >= 8) ? above1 : above0;
-        pixel *left  = (cuSize >= 8) ? left1 : left0;
-        primitives.intra_pred[PLANAR_IDX][sizeIdx](m_predictions, cuSize, left, above, 0, 0);
+
+        pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
+        primitives.intra_pred_new[PLANAR_IDX][sizeIdx](m_predictions, cuSize, planar, 0, 0);
         cost = m_me.bufSATD(m_predictions, cuSize);
         if (cost < icost)
             icost = cost;
 
         uint32_t mode, lowmode = 4;
-        if (primitives.intra_pred_allangs[sizeIdx])
+        if (primitives.intra_pred_allangs_new[sizeIdx])
         {
             ALIGN_VAR_32(pixel, buf_trans[32 * 32]);
 
-            primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
+//            primitives.intra_pred_allangs[sizeIdx](m_predictions + 2 * predsize, above0, left0, above1, left1, (cuSize <= 16));
+            primitives.intra_pred_allangs_new[sizeIdx](m_predictions + 2 * predsize, neighbours[0], neighbours[1], (cuSize <= 16));
             primitives.cu[sizeIdx].transpose(buf_trans, m_me.fencPUYuv.m_buf[0], FENC_STRIDE);
 
             int acost = m_me.COST_MAX;
@@ -1762,9 +1764,11 @@
             for (mode = 5; mode < 35; mode += 5)
             {
                 if (g_intraFilterFlags[mode] & cuSize)
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+//                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[1], mode, cuSize <= 16);
                 else
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+//                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[0], mode, cuSize <= 16);
                 cost = m_me.bufSATD(m_predictions, cuSize);
                 COPY2_IF_LT(acost, cost, lowmode, mode);
             }
@@ -1775,17 +1779,21 @@
 
                 mode = minusmode;
                 if (g_intraFilterFlags[mode] & cuSize)
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+//                   primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[1], mode, cuSize <= 16);
                 else
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+//                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[0], mode, cuSize <= 16);
                 cost = m_me.bufSATD(m_predictions, cuSize);
                 COPY2_IF_LT(acost, cost, lowmode, mode);
 
                 mode = plusmode;
                 if (g_intraFilterFlags[mode] & cuSize)
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+//                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left1, above1, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[1], mode, cuSize <= 16);
                 else
-                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+//                    primitives.intra_pred[mode][sizeIdx](m_predictions, cuSize, left0, above0, mode, cuSize <= 16);
+                    primitives.intra_pred_new[mode][sizeIdx](m_predictions, cuSize, neighbours[0], mode, cuSize <= 16);
                 cost = m_me.bufSATD(m_predictions, cuSize);
                 COPY2_IF_LT(acost, cost, lowmode, mode);
             }


More information about the x265-devel mailing list