[x265] simplify intra filter (with fix for da61cf406f16) (Re: primitives: intra_pred[4][35] => intra_pred[35][4] (avoid *35))

Satoshi Nakagawa nakagawa424 at oki.com
Mon Sep 22 14:34:01 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1411388939 -32400
#      Mon Sep 22 21:28:59 2014 +0900
# Node ID 3f229951f826e1d09dd0258721ef5a1f9fdc4392
# Parent  fd435504f15e0b13dabba9efe0aa94e7047060b5
simplify intra filter (with fix for da61cf406f16)

diff -r fd435504f15e -r 3f229951f826 source/Lib/TLibCommon/TComPattern.cpp
--- a/source/Lib/TLibCommon/TComPattern.cpp	Mon Sep 22 13:14:54 2014 +0530
+++ b/source/Lib/TLibCommon/TComPattern.cpp	Mon Sep 22 21:28:59 2014 +0900
@@ -52,133 +52,96 @@
 void TComPattern::initAdiPattern(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf,
                                  pixel* refAbove, pixel* refLeft, pixel* refAboveFlt, pixel* refLeftFlt, int dirMode)
 {
-    pixel* roiOrigin;
-    pixel* adiTemp;
-
-    int picStride = cu->m_pic->getStride();
-
     IntraNeighbors intraNeighbors;
 
     initIntraNeighbors(cu, zOrderIdxInPart, partDepth, true, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
     uint32_t tuSize2 = tuSize << 1;
 
-    roiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
-    adiTemp   = adiBuf;
+    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getLumaAddr(cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
+    int picStride = cu->m_pic->getStride();
 
-    fillReferenceSamples(roiOrigin, picStride, adiTemp, intraNeighbors);
+    fillReferenceSamples(adiOrigin, picStride, adiBuf, intraNeighbors);
 
+    // initialization of ADI buffers
+    const int bufOffset = tuSize - 1;
+    refAbove += bufOffset;
+    refLeft += bufOffset;
+
+    //  ADI_BUF_STRIDE * (2 * tuSize + 1);
+    memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
+    for (int k = 0; k < tuSize2 + 1; k++)
+        refLeft[k] = adiBuf[k * ADI_BUF_STRIDE];
+    
     bool bUseFilteredPredictions = (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize);
 
     if (bUseFilteredPredictions)
     {
         // generate filtered intra prediction samples
-        // left and left above border + above and above right border + top left corner = length of 3. filter buffer
-        int bufSize = tuSize2 + tuSize2 + 1;
-        uint32_t wh = ADI_BUF_STRIDE * (tuSize2 + 1);         // number of elements in one buffer
+        refAboveFlt += bufOffset;
+        refLeftFlt += bufOffset;
 
-        pixel* filterBuf  = adiBuf + wh;         // buffer for 2. filtering (sequential)
-        pixel* filterBufN = filterBuf + bufSize; // buffer for 1. filtering (sequential)
+        bool bStrongSmoothing = (tuSize == 32 && cu->m_slice->m_sps->bUseStrongIntraSmoothing);
 
-        int l = 0;
-        // left border from bottom to top
-        for (int i = 0; i < tuSize2; i++)
+        if (bStrongSmoothing)
         {
-            filterBuf[l++] = adiTemp[ADI_BUF_STRIDE * (tuSize2 - i)];
-        }
+            const int trSize  = 32;
+            const int trSize2 = 32 * 2;
+            const int threshold = 1 << (X265_DEPTH - 5);
+            int refBL = refLeft[trSize2];
+            int refTL = refAbove[0];
+            int refTR = refAbove[trSize2];
+            bStrongSmoothing = (abs(refBL + refTL - 2 * refLeft[trSize])  < threshold &&
+                                abs(refTL + refTR - 2 * refAbove[trSize]) < threshold);
 
-        // top left corner
-        filterBuf[l++] = adiTemp[0];
+            if (bStrongSmoothing)
+            {
+                // bilinear interpolation
+                const int shift = 5 + 1; // intraNeighbors.log2TrSize + 1;
+                int init = (refTL << shift) + tuSize;
+                int delta;
 
-        // above border from left to right
-        memcpy(&filterBuf[l], &adiTemp[1], tuSize2 * sizeof(*filterBuf));
+                refLeftFlt[0] = refAboveFlt[0] = refAbove[0];
 
-        if (tuSize >= 32 && cu->m_slice->m_sps->bUseStrongIntraSmoothing)
-        {
-            int bottomLeft = filterBuf[0];
-            int topLeft = filterBuf[tuSize2];
-            int topRight = filterBuf[bufSize - 1];
-            int threshold = 1 << (X265_DEPTH - 5);
-            bool bilinearLeft = abs(bottomLeft + topLeft - 2 * filterBuf[tuSize]) < threshold;
-            bool bilinearAbove  = abs(topLeft + topRight - 2 * filterBuf[tuSize2 + tuSize]) < threshold;
+                //TODO: Performance Primitive???
+                delta = refBL - refTL;
+                for (int i = 1; i < trSize2; i++)
+                    refLeftFlt[i] = (init + delta * i) >> shift;
+                refLeftFlt[trSize2] = refLeft[trSize2];
 
-            if (bilinearLeft && bilinearAbove)
-            {
-                int shift = intraNeighbors.log2TrSize + 1;
-                filterBufN[0] = filterBuf[0];
-                filterBufN[tuSize2] = filterBuf[tuSize2];
-                filterBufN[bufSize - 1] = filterBuf[bufSize - 1];
-                //TODO: Performance Primitive???
-                for (int i = 1; i < tuSize2; i++)
-                {
-                    filterBufN[i] = ((tuSize2 - i) * bottomLeft + i * topLeft + tuSize) >> shift;
-                }
+                delta = refTR - refTL;
+                for (int i = 1; i < trSize2; i++)
+                    refAboveFlt[i] = (init + delta * i) >> shift;
+                refAboveFlt[trSize2] = refAbove[trSize2];
 
-                for (int i = 1; i < tuSize2; i++)
-                {
-                    filterBufN[tuSize2 + i] = ((tuSize2 - i) * topLeft + i * topRight + tuSize) >> shift;
-                }
-            }
-            else
-            {
-                // 1. filtering with [1 2 1]
-                filterBufN[0] = filterBuf[0];
-                filterBufN[bufSize - 1] = filterBuf[bufSize - 1];
-                for (int i = 1; i < bufSize - 1; i++)
-                {
-                    filterBufN[i] = (filterBuf[i - 1] + 2 * filterBuf[i] + filterBuf[i + 1] + 2) >> 2;
-                }
-            }
-        }
-        else
-        {
-            // 1. filtering with [1 2 1]
-            filterBufN[0] = filterBuf[0];
-            filterBufN[bufSize - 1] = filterBuf[bufSize - 1];
-            for (int i = 1; i < bufSize - 1; i++)
-            {
-                filterBufN[i] = (filterBuf[i - 1] + 2 * filterBuf[i] + filterBuf[i + 1] + 2) >> 2;
+                return;
             }
         }
 
-        // initialization of ADI buffers
-        refAboveFlt += tuSize - 1;
-        refLeftFlt += tuSize - 1;
-        memcpy(refAboveFlt, filterBufN + tuSize2, (tuSize2 + 1) * sizeof(pixel));
-        for (int k = 0; k < tuSize2 + 1; k++)
-        {
-            refLeftFlt[k] = filterBufN[tuSize2 - k];   // Smoothened
-        }
-    }
+        refLeft[-1] = refAbove[1];
+        for (int i = 0; i < tuSize2; i++)
+            refLeftFlt[i] = (refLeft[i - 1] + 2 * refLeft[i] + refLeft[i + 1] + 2) >> 2;
+        refLeftFlt[tuSize2] = refLeft[tuSize2];
 
-    // initialization of ADI buffers
-    refAbove += tuSize - 1;
-    refLeft += tuSize - 1;
-
-    //  ADI_BUF_STRIDE * (2 * tuSize + 1);
-    memcpy(refAbove, adiBuf, (tuSize2 + 1) * sizeof(pixel));
-    for (int k = 0; k < tuSize2 + 1; k++)
-    {
-        refLeft[k] = adiBuf[k * ADI_BUF_STRIDE];
+        refAboveFlt[0] = refLeftFlt[0];
+        for (int i = 1; i < tuSize2; i++)
+            refAboveFlt[i] = (refAbove[i - 1] + 2 * refAbove[i] + refAbove[i + 1] + 2) >> 2;
+        refAboveFlt[tuSize2] = refAbove[tuSize2];
     }
 }
 
 void TComPattern::initAdiPatternChroma(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, pixel* adiBuf, uint32_t chromaId)
 {
-    pixel*  roiOrigin;
-    pixel*  adiTemp;
-
-    int picStride = cu->m_pic->getCStride();
-
     IntraNeighbors intraNeighbors;
 
     initIntraNeighbors(cu, zOrderIdxInPart, partDepth, false, &intraNeighbors);
     uint32_t tuSize = intraNeighbors.tuSize;
 
-    roiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
-    adiTemp   = getAdiChromaBuf(chromaId, tuSize, adiBuf);
+    pixel* adiOrigin = cu->m_pic->getPicYuvRec()->getChromaAddr(chromaId, cu->getAddr(), cu->getZorderIdxInCU() + zOrderIdxInPart);
+    int picStride = cu->m_pic->getCStride();
+    pixel* adiRef = getAdiChromaBuf(chromaId, tuSize, adiBuf);
 
-    fillReferenceSamples(roiOrigin, picStride, adiTemp, intraNeighbors);
+    fillReferenceSamples(adiOrigin, picStride, adiRef, intraNeighbors);
 }
 
 void TComPattern::initIntraNeighbors(TComDataCU* cu, uint32_t zOrderIdxInPart, uint32_t partDepth, bool isLuma, IntraNeighbors *intraNeighbors)
@@ -226,14 +189,13 @@
     intraNeighbors->log2TrSize       = log2TrSize;
 }
 
-void TComPattern::fillReferenceSamples(pixel* roiOrigin, int picStride, pixel* adiTemp, const IntraNeighbors& intraNeighbors)
+void TComPattern::fillReferenceSamples(pixel* adiOrigin, int picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors)
 {
     int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
     int totalUnits       = intraNeighbors.totalUnits;
     uint32_t tuSize      = intraNeighbors.tuSize;
 
     uint32_t refSize = tuSize * 2 + 1;
-    pixel* roiTemp;
     int  i, j;
     int  dcValue = 1 << (X265_DEPTH - 1);
 
@@ -241,27 +203,23 @@
     {
         // Fill border with DC value
         for (i = 0; i < refSize; i++)
-        {
-            adiTemp[i] = dcValue;
-        }
+            adiRef[i] = dcValue;
 
         for (i = 1; i < refSize; i++)
-        {
-            adiTemp[i * ADI_BUF_STRIDE] = dcValue;
-        }
+            adiRef[i * ADI_BUF_STRIDE] = dcValue;
     }
     else if (numIntraNeighbor == totalUnits)
     {
         // Fill top border with rec. samples
-        roiTemp = roiOrigin - picStride - 1;
-        memcpy(adiTemp, roiTemp, refSize * sizeof(*adiTemp));
+        pixel* adiTemp = adiOrigin - picStride - 1;
+        memcpy(adiRef, adiTemp, refSize * sizeof(*adiRef));
 
         // Fill left border with rec. samples
-        roiTemp = roiOrigin - 1;
+        adiTemp = adiOrigin - 1;
         for (i = 1; i < refSize; i++)
         {
-            adiTemp[i * ADI_BUF_STRIDE] = roiTemp[0];
-            roiTemp += picStride;
+            adiRef[i * ADI_BUF_STRIDE] = adiTemp[0];
+            adiTemp += picStride;
         }
     }
     else // reference samples are partially available
@@ -284,12 +242,12 @@
         }
 
         // Fill top-left sample
-        roiTemp = roiOrigin - picStride - 1;
+        pixel* adiTemp =  adiOrigin - picStride - 1;
         pAdiLineTemp = pAdiLine + (leftUnits * unitHeight);
         pNeighborFlags = bNeighborFlags + leftUnits;
         if (*pNeighborFlags)
         {
-            pixel topLeftVal = roiTemp[0];
+            pixel topLeftVal = adiTemp[0];
             for (i = 0; i < unitWidth; i++)
             {
                 pAdiLineTemp[i] = topLeftVal;
@@ -297,7 +255,7 @@
         }
 
         // Fill left & below-left samples
-        roiTemp += picStride;
+        adiTemp += picStride;
         pAdiLineTemp--;
         pNeighborFlags--;
         for (j = 0; j < leftUnits; j++)
@@ -306,25 +264,23 @@
             {
                 for (i = 0; i < unitHeight; i++)
                 {
-                    pAdiLineTemp[-i] = roiTemp[i * picStride];
+                    pAdiLineTemp[-i] = adiTemp[i * picStride];
                 }
             }
-            roiTemp += unitHeight * picStride;
+            adiTemp += unitHeight * picStride;
             pAdiLineTemp -= unitHeight;
             pNeighborFlags--;
         }
 
         // Fill above & above-right samples
-        roiTemp = roiOrigin - picStride;
+        adiTemp = adiOrigin - picStride;
         pAdiLineTemp = pAdiLine + (leftUnits * unitHeight) + unitWidth;
         pNeighborFlags = bNeighborFlags + leftUnits + 1;
         for (j = 0; j < aboveUnits; j++)
         {
             if (*pNeighborFlags)
-            {
-                memcpy(pAdiLineTemp, roiTemp, unitWidth * sizeof(*adiTemp));
-            }
-            roiTemp += unitWidth;
+                memcpy(pAdiLineTemp, adiTemp, unitWidth * sizeof(*adiTemp));
+            adiTemp += unitWidth;
             pAdiLineTemp += unitWidth;
             pNeighborFlags++;
         }
@@ -395,12 +351,12 @@
 
         // Copy processed samples
         pAdiLineTemp = pAdiLine + refSize + unitWidth - 2;
-        memcpy(adiTemp, pAdiLineTemp, refSize * sizeof(*adiTemp));
+        memcpy(adiRef, pAdiLineTemp, refSize * sizeof(*adiRef));
 
         pAdiLineTemp = pAdiLine + refSize - 1;
         for (i = 1; i < refSize; i++)
         {
-            adiTemp[i * ADI_BUF_STRIDE] = pAdiLineTemp[-i];
+            adiRef[i * ADI_BUF_STRIDE] = pAdiLineTemp[-i];
         }
     }
 }
diff -r fd435504f15e -r 3f229951f826 source/Lib/TLibCommon/TComPattern.h
--- a/source/Lib/TLibCommon/TComPattern.h	Mon Sep 22 13:14:54 2014 +0530
+++ b/source/Lib/TLibCommon/TComPattern.h	Mon Sep 22 21:28:59 2014 +0900
@@ -95,7 +95,7 @@
 private:
 
     /// padding of unavailable reference samples for intra prediction
-    static void fillReferenceSamples(pixel* roiOrigin, int picStride, pixel* adiTemp, const IntraNeighbors& intraNeighbors);
+    static void fillReferenceSamples(pixel* adiOrigin, int picStride, pixel* adiRef, const IntraNeighbors& intraNeighbors);
 
     /// constrained intra prediction
     static bool  isAboveLeftAvailable(TComDataCU* cu, uint32_t partIdxLT);
diff -r fd435504f15e -r 3f229951f826 source/encoder/predict.cpp
--- a/source/encoder/predict.cpp	Mon Sep 22 13:14:54 2014 +0530
+++ b/source/encoder/predict.cpp	Mon Sep 22 21:28:59 2014 +0900
@@ -106,58 +106,45 @@
 void Predict::predIntraChromaAng(pixel* src, uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
 {
     int tuSize = 1 << log2TrSizeC;
-    uint32_t tuSize2 = tuSize << 1;
+    int tuSize2 = tuSize << 1;
 
     // Create the prediction
-    pixel* refAbv;
-    pixel refLft[3 * MAX_CU_SIZE];
+    const int bufOffset = tuSize - 1;
+    pixel buf0[3 * MAX_CU_SIZE];
+    pixel buf1[3 * MAX_CU_SIZE];
+    pixel* above;
+    pixel* left = buf0 + bufOffset;
+
+    int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
+    for (int k = 0; k < limit; k++)
+        left[k] = src[k * ADI_BUF_STRIDE];
 
     bool bUseFilteredPredictions = (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize));
 
     if (bUseFilteredPredictions)
     {
         // generate filtered intra prediction samples
-        // left and left above border + above and above right border + top left corner = length of 3. filter buffer
-        int bufSize = tuSize2 + tuSize2 + 1;
-        uint32_t wh = ADI_BUF_STRIDE * (tuSize2 + 1);         // number of elements in one buffer
+        buf0[bufOffset - 1] = src[1];
+        left = buf1 + bufOffset;
+        for (int i = 0; i < tuSize2; i++)
+            left[i] = (buf0[bufOffset + i - 1] + 2 * buf0[bufOffset + i] + buf0[bufOffset + i + 1] + 2) >> 2;
+        left[tuSize2] = buf0[bufOffset + tuSize2];
 
-        pixel* filterBuf  = src + wh;            // buffer for 2. filtering (sequential)
-        pixel* filterBufN = filterBuf + bufSize; // buffer for 1. filtering (sequential)
-
-        int l = 0;
-        // left border from bottom to top
-        for (uint32_t i = 0; i < tuSize2; i++)
-            filterBuf[l++] = src[ADI_BUF_STRIDE * (tuSize2 - i)];
-
-        // top left corner
-        filterBuf[l++] = src[0];
-
-        // above border from left to right
-        memcpy(&filterBuf[l], &src[1], tuSize2 * sizeof(*filterBuf));
-
-        // 1. filtering with [1 2 1]
-        filterBufN[0] = filterBuf[0];
-        filterBufN[bufSize - 1] = filterBuf[bufSize - 1];
-        for (int i = 1; i < bufSize - 1; i++)
-            filterBufN[i] = (filterBuf[i - 1] + 2 * filterBuf[i] + filterBuf[i + 1] + 2) >> 2;
-
-        // initialization of ADI buffers
-        int limit = tuSize2 + 1;
-        refAbv = filterBufN + tuSize2;
-        for (int k = 0; k < limit; k++)
-            refLft[k + tuSize - 1] = filterBufN[tuSize2 - k];   // Smoothened
+        above = buf0 + bufOffset;
+        above[0] = left[0];
+        for (int i = 1; i < tuSize2; i++)
+            above[i] = (src[i - 1] + 2 * src[i] + src[i + 1] + 2) >> 2;
+        above[tuSize2] = src[tuSize2];
     }
     else
     {
-        int limit = (dirMode <= 25 && dirMode >= 11) ? (tuSize + 1 + 1) : (tuSize2 + 1);
-        refAbv = src;
-        for (int k = 0; k < limit; k++)
-            refLft[k + tuSize - 1] = src[k * ADI_BUF_STRIDE];
+        above = buf1 + bufOffset;
+        memcpy(above, src, (tuSize2 + 1) * sizeof(pixel));
     }
 
     int sizeIdx = log2TrSizeC - 2;
     X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
-    primitives.intra_pred[dirMode][sizeIdx](dst, stride, refLft + tuSize - 1, refAbv, dirMode, 0);
+    primitives.intra_pred[dirMode][sizeIdx](dst, stride, left, above, dirMode, 0);
 }
 
 bool Predict::checkIdenticalMotion()


More information about the x265-devel mailing list