[x265] [PATCH 1 of 5] intra: pull the simple 1:2:1 pixel filtering into a performance primitive

Steve Borho steve at borho.org
Fri Feb 27 21:05:17 CET 2015


# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1425064240 21600
#      Fri Feb 27 13:10:40 2015 -0600
# Node ID 651e1baa3ecb02e75a08983d5cb8f824371887b6
# Parent  018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
intra: pull the simple 1:2:1 pixel filtering into a performance primitive

Only C reference implementations are provided at this point, but at least the
primitive is templated so the compiler can optimize and unroll the loops cleanly.

As a side effect, this quiets the GCC 4.8 warning about loop bounds and
aggressive loop optimizations in slicetype.cpp.

diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/intrapred.cpp	Fri Feb 27 13:10:40 2015 -0600
@@ -27,6 +27,29 @@
 using namespace x265;
 
 namespace {
+
+template<int tuSize>
+void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 smoothing of the top and left reference samples: samples[0] is the top-left corner, indices 1..2*tuSize are the top samples, indices 2*tuSize+1..4*tuSize are the left samples; results are written to the same indices of filtered */
+{
+    const int tuSize2 = tuSize << 1; // number of reference samples along each edge (2 * tuSize)
+
+    pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2]; // corner and the two end samples, read up front
+
+    // top edge: each interior sample becomes (prev + 2 * cur + next + 2) >> 2, i.e. a rounded 1:2:1 average
+    for (int i = 1; i < tuSize2; i++)
+        filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
+    filtered[tuSize2] = topLast; // last top sample is copied through unfiltered
+    
+    // top-left corner: its two neighbours are the first top sample and the first left sample
+    filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
+
+    // left edge: the first left sample uses the top-left corner as its previous neighbour, the rest use their adjacent left samples
+    filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
+    for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
+        filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
+    filtered[tuSize2 + tuSize2] = leftLast; // last left sample is copied through unfiltered
+}
+
 void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
 {
     // boundary pixels processing
@@ -216,6 +239,11 @@
 
 void setupIntraPrimitives_c(EncoderPrimitives& p)
 {
+    p.cu[BLOCK_4x4].intra_filter = intraFilter<4>;
+    p.cu[BLOCK_8x8].intra_filter = intraFilter<8>;
+    p.cu[BLOCK_16x16].intra_filter = intraFilter<16>;
+    p.cu[BLOCK_32x32].intra_filter = intraFilter<32>;
+
     p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>;
     p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>;
     p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = planar_pred_c<4>;
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/predict.cpp
--- a/source/common/predict.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/predict.cpp	Fri Feb 27 13:10:40 2015 -0600
@@ -93,34 +93,17 @@
 void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt)
 {
     int tuSize = 1 << log2TrSizeC;
-    int tuSize2 = tuSize << 1;
+    int sizeIdx = log2TrSizeC - 2;
+    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
 
     pixel* srcBuf = intraNeighbourBuf[0];
 
     if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize))
     {
-        pixel* fltBuf = intraNeighbourBuf[1];
-        pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2];
-
-        // filtering top
-        for (int i = 1; i < tuSize2; i++)
-            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
-        fltBuf[tuSize2] = topLast;
-
-        // filtering top-left
-        fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2;
-
-        // filtering left
-        fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2;
-        for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
-            fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2;
-        fltBuf[tuSize2 + tuSize2] = leftLast;
-
+        primitives.cu[sizeIdx].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
         srcBuf = intraNeighbourBuf[1];
     }
 
-    int sizeIdx = log2TrSizeC - 2;
-    X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n");
     primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0);
 }
 
@@ -626,12 +609,12 @@
     }
 }
 
-void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
+void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
-    int tuSize = intraNeighbors.tuSize;
+    int tuSize = 1 << intraNeighbors.log2TrSize;
     int tuSize2 = tuSize << 1;
 
-    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_stride;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
@@ -673,25 +656,13 @@
             }
         }
 
-        // filtering top
-        for (int i = 1; i < tuSize2; i++)
-            fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
-        fltBuf[tuSize2] = topLast;
-
-        // filtering top-left
-        fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2;
-
-        // filtering left
-        fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2;
-        for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++)
-            fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2;
-        fltBuf[tuSize2 + tuSize2] = leftLast;
+        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
     }
 }
 
-void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
+void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId)
 {
-    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
+    const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
     intptr_t picStride = cu.m_encData->m_reconPic->m_strideC;
 
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
@@ -750,7 +721,7 @@
     intraNeighbors->leftUnits = leftUnits;
     intraNeighbors->unitWidth = 1 << log2UnitWidth;
     intraNeighbors->unitHeight = 1 << log2UnitHeight;
-    intraNeighbors->tuSize = tuSize;
+    intraNeighbors->log2TrSize = log2TrSize;
 }
 
 void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258])
@@ -758,7 +729,7 @@
     const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));
     int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
     int totalUnits = intraNeighbors.totalUnits;
-    uint32_t tuSize = intraNeighbors.tuSize;
+    uint32_t tuSize = 1 << intraNeighbors.log2TrSize;
     uint32_t refSize = tuSize * 2 + 1;
 
     // Nothing is available, perform DC prediction.
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/predict.h
--- a/source/common/predict.h	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/predict.h	Fri Feb 27 13:10:40 2015 -0600
@@ -67,7 +67,7 @@
         int      leftUnits;
         int      unitWidth;
         int      unitHeight;
-        int      tuSize;
+        int      log2TrSize;
         bool     bNeighborFlags[4 * MAX_NUM_SPU_W + 1];
     };
 
@@ -102,8 +102,8 @@
     /* Angular Intra */
     void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize);
     void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt);
-    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
-    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
+    void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode);
+    void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId);
 
     /* Intra prediction helper functions */
     static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors);
diff -r 018e8bbaa854 -r 651e1baa3ecb source/common/primitives.h
--- a/source/common/primitives.h	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/primitives.h	Fri Feb 27 13:10:40 2015 -0600
@@ -119,6 +119,7 @@
 
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
 typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
 
 typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
@@ -247,6 +248,7 @@
 
         transpose_t     transpose;     // transpose pixel block; for use with intra all-angs
         intra_allangs_t intra_pred_allangs;
+        intra_filter_t  intra_filter;
         intra_pred_t    intra_pred[NUM_INTRA_MODE];
     }
     cu[NUM_CU_SIZES];
diff -r 018e8bbaa854 -r 651e1baa3ecb source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/encoder/slicetype.cpp	Fri Feb 27 13:10:40 2015 -0600
@@ -212,6 +212,7 @@
     ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]);
     pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE];
     pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1];
+    pixel* samples = neighbours[0], *filtered = neighbours[1];
 
     const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
     const int intraPenalty = 5 * lookAheadLambda;
@@ -221,8 +222,8 @@
     const int cuSize2 = cuSize << 1;
     const int sizeIdx = X265_LOWRES_CU_BITS - 2;
 
-    pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0];
     pixelcmp_t satd = primitives.pu[sizeIdx].satd;
+    int planar = !!(cuSize >= 8);
 
     fenc.costEst[0][0] = 0;
     fenc.costEstAq[0][0] = 0;
@@ -235,43 +236,33 @@
         {
             const int cuXY = cuX + cuY * widthInCU;
             const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride;
+            pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
 
-            /* Prep reference pixels */
-            pixel *pixCur = fenc.lowresPlane[0] + pelOffset;
+            /* copy fenc pixels */
             primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, pixCur, fenc.lumaStride);
 
-            memcpy(neighbours[0], pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
+            /* collect reference sample pixels */
+            memcpy(samples, pixCur - 1 - fenc.lumaStride, (cuSize + 1) * sizeof(pixel));
             for (int i = 1; i < cuSize + 1; i++)
-                neighbours[0][i + cuSize2] = pixCur[(i - 1) * fenc.lumaStride - 1];
+                samples[i + cuSize2] = pixCur[(i - 1) * fenc.lumaStride - 1];
 
             for (int i = 0; i < cuSize; i++)
             {
-                neighbours[0][i + cuSize + 1] = neighbours[0][cuSize];                     // Copy above-last pixel
-                neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; // Copy left-last pixel
+                samples[i + cuSize + 1] = samples[cuSize];                     // Copy above-last pixel
+                samples[i + cuSize2 + cuSize + 1] = samples[cuSize2 + cuSize]; // Copy left-last pixel
             }
 
-            neighbours[1][0]  = neighbours[0][0];                      // Copy top-left pixel 
-            neighbours[1][cuSize2] = neighbours[0][cuSize2];           // Copy top-right pixel
-            neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel
-
-            // Filter neighbour pixels with [1-2-1]
-            neighbours[1][1]           = (neighbours[0][0] + (neighbours[0][1] << 1)           + neighbours[0][2] + 2)               >> 2;
-            neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2;
-            for (int i = 2; i < cuSize2; i++)
-            {
-                neighbours[1][i]           = (neighbours[0][i - 1]           + (neighbours[0][i] << 1)           + neighbours[0][i + 1]      + 2) >> 2;
-                neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2;
-            }
+            primitives.cu[sizeIdx].intra_filter(samples, filtered);
 
             int cost, icost = me.COST_MAX;
             uint32_t ilowmode = 0;
 
             /* DC and planar */
-            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, cuSize <= 16);
+            primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, samples, 0, cuSize <= 16);
             cost = satd(fencIntra, cuSize, prediction, cuSize);
             COPY2_IF_LT(icost, cost, ilowmode, DC_IDX);
 
-            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0);
+            primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, neighbours[planar], 0, 0);
             cost = satd(fencIntra, cuSize, prediction, cuSize);
             COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX);
 


More information about the x265-devel mailing list