[x265] [PATCH]Improvement intra_pred_planar

Thu Jul 18 10:21:25 CEST 2013

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1374133971 -28800
# Node ID a4013cdafef00502efe0d496dcb3c4f2bce966a4
# Parent  f813f110d69a1a6650e813dd4e612216982a0264
intrapred: improvement intra_pred_planar

diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibCommon/TComPrediction.cpp

--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -177,17 +177,16 @@
     }
 
     // get starting pixel in block
-    Int sw = ADI_BUF_STRIDE;
     Bool bFilter = (size <= 16);
 
     // Create the prediction
     if (dirMode == PLANAR_IDX)
     {
-        primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, size);
+        primitives.intra_pred_planar((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size);
     }
     else if (dirMode == DC_IDX)
     {
-        primitives.intra_pred_dc(refAbv + 1, refLft + 1, dst, stride, size, bFilter);
+        primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
     }
     else
     {
@@ -198,33 +197,28 @@
 // Angular chroma
 Void TComPrediction::predIntraChromaAng(Pel* src, UInt dirMode, Pel* dst, UInt stride, Int width)
 {
+    // Create the prediction
+    Pel refAbv[3 * MAX_CU_SIZE];
+    Pel refLft[3 * MAX_CU_SIZE];
+    int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1 + 1) : (2 * width + 1);
+    memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));
+    for (int k = 0; k < limit; k++)
+    {
+        refLft[k + width - 1] = src[k * ADI_BUF_STRIDE];
+    }
+
     // get starting pixel in block
-    Int sw = ADI_BUF_STRIDE;
-
     if (dirMode == PLANAR_IDX)
     {
-        primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, width);
+        primitives.intra_pred_planar((pixel*)refAbv + width - 1 + 1, (pixel*)refLft + width - 1 + 1, (pixel*)dst, stride, width);
+    }
+    else if (dirMode == DC_IDX)
+    {
+        primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
     }
     else
     {
-        // Create the prediction
-        Pel refAbv[3 * MAX_CU_SIZE];
-        Pel refLft[3 * MAX_CU_SIZE];
-        int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1) : (2 * width + 1);
-        memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));
-        for (int k = 0; k < limit; k++)
-        {
-            refLft[k + width - 1] = src[k * sw];
-        }
-
-        if (dirMode == DC_IDX)
-        {
-            primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
-        }
-        else
-        {
-            primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);
-        }
+        primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);
     }
 }
 
diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -1975,6 +1975,8 @@
             Pel *pAbove1 = refAboveFlt + width - 1;
             Pel *pLeft0  = refLeft     + width - 1;
             Pel *pLeft1  = refLeftFlt  + width - 1;
+            Pel *above   = pAbove0;
+            Pel *left    = pLeft0;
 
             // 1
             primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);
@@ -1984,8 +1986,10 @@
             if (width >= 8 && width <= 32)
             {
                 predSrc += ADI_BUF_STRIDE * (2 * width + 1);
+                above = pAbove1;
+                left  = pLeft1;
             }
-            primitives.intra_pred_planar(predSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);
+            primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);
             modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
 
             // 33 Angle modes once
diff -r f813f110d69a -r a4013cdafef0 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/intrapred.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -98,7 +98,7 @@
     }
 }
 
-void PredIntraPlanar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)
+void PredIntraPlanar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
 {
     //assert(width == height);
 
@@ -117,8 +117,8 @@
     // Get left and above reference column and row
     for (k = 0; k < blkSize + 1; k++)
     {
-        topRow[k] = src[k - srcStride];
-        leftColumn[k] = src[k * srcStride - 1];
+        topRow[k] = above[k];
+        leftColumn[k] = left[k];
     }
 
     // Prepare intermediate variables used in interpolation
diff -r f813f110d69a -r a4013cdafef0 source/common/primitives.h
--- a/source/common/primitives.h Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/primitives.h Thu Jul 18 15:52:51 2013 +0800
@@ -195,7 +195,7 @@
 typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
 
 typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
-typedef void (*intra_planar_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width);
+typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
 typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
 typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
 
diff -r f813f110d69a -r a4013cdafef0 source/common/vec/intrapred.inc
--- a/source/common/vec/intrapred.inc Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/vec/intrapred.inc Thu Jul 18 15:52:51 2013 +0800
@@ -746,27 +746,42 @@
 #endif // if HIGH_BIT_DEPTH
 }
 
+#if INSTRSET >= 4  // SSSE3
+    #define BROADCAST16(a, d, x) { \
+        const __m128i mask = _mm_set1_epi16( (((d) * 2) | ((d) * 2 + 1) << 8) ); \
+        (x) = _mm_shuffle_epi8((a), mask); \
+    }
+#else
+    #define BROADCAST16(a, d, x) { \
+        const int dL = (d) & 3; \
+        const int dH = ((d)-4) & 3; \
+        if (d>=4) { \
+            (x) = _mm_shufflehi_epi16((a), dH * 0x55); \
+            (x) = _mm_unpackhi_epi64((x), (x)); \
+        } \
+        else { \
+            (x) = _mm_shufflelo_epi16((a), dL * 0x55); \
+            (x) = _mm_unpacklo_epi64((x), (x)); \
+        } \
+    }
+#endif
+
+
 #if HIGH_BIT_DEPTH
-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp
-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input is 12bpp
+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k, bottomLeft, topRight;
+    int bottomLeft, topRight;
     // NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits
-    int16_t leftColumn[4];
 
     // Get left and above reference column and row
-    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &src[-srcStride]); // topRow
-
-    for (k = 0; k < 4; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn);   // leftColumn
+    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), above); // topRow
+
+    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), left);   // leftColumn
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[4 * srcStride - 1];
-    topRight   = src[4 - srcStride];
+    bottomLeft = left[4];
+    topRight   = above[4];
 
     Vec8s v_bottomLeft(bottomLeft);
     Vec8s v_topRight(topRight);
@@ -819,51 +834,45 @@
 }
 
 #else /* if HIGH_BIT_DEPTH */
-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k;
     pixel bottomLeft, topRight;
 
     // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &src[-srcStride]); // topRow
-    Vec8s v_topRow = extend_low(im0);
-
-    int16_t leftColumn[4];
-
-    for (k = 0; k < 4; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn);   // leftColumn
+    __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow
+    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+    __m128i v_leftColumn = _mm_cvtsi32_si128(*(int*)left);  // leftColumn
+    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[4 * srcStride - 1];
-    topRight   = src[4 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    v_topRow = v_topRow << const_int(2);
-    v_leftColumn = v_leftColumn << const_int(2);
-
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
+    bottomLeft = left[4];
+    topRight   = above[4];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+    __m128i v_topRight = _mm_set1_epi16(topRight);
+
+    __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+    v_topRow = _mm_slli_epi16(v_topRow, 2);
+    v_leftColumn = _mm_slli_epi16(v_leftColumn, 2);
+
+    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(4));
+    const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    __m128i v_horPred, v_rightColumnN;
+    __m128i v_im4;
+    __m128i v_im5;
 
 #define COMP_PRED_PLANAR4_ROW(X) { \
-        v_horPred = broadcast(const_int((X)), v_horPred4); \
-        v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
-        v_horPred = v_horPred + v_rightColumnN; \
-        v_topRow = v_topRow + v_bottomRow; \
-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
-        v_im5 = compress_unsafe(v_im4, v_im4); \
-        store_partial(const_int(4), &dst[(X)*dstStride], v_im5); \
+        BROADCAST16(v_horPred4, (X), v_horPred); \
+        BROADCAST16(v_rightColumn, (X), v_rightColumnN); \
+        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi); \
+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
+        v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+        *(int*)&dst[(X)*dstStride] = _mm_cvtsi128_si32(v_im5); \
 }
 
     COMP_PRED_PLANAR4_ROW(0)
@@ -875,19 +884,19 @@
 }
 
 #if INSTRSET >= 5
-void intra_pred_planar4_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
     pixel bottomLeft, topRight;
 
     // Get left and above reference column and row
-    __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&src[-srcStride]); // topRow
-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+    __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow
+    __m128i v_topRow = _mm_cvtepu8_epi16(im0);
 
     v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[4 * srcStride - 1];
-    topRight   = src[4 - srcStride];
+    bottomLeft = left[4];
+    topRight   = above[4];
 
     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
     __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);
@@ -904,14 +913,14 @@
     v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
 
 #define COMP_PRED_PLANAR_2ROW(Y) { \
-        _tmp0 = _mm_cvtsi32_si128((src[((Y)) * srcStride - 1] << 2) + 4); \
+        _tmp0 = _mm_cvtsi32_si128((left[(Y)] << 2) + 4); \
         _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
-        _tmp1 = _mm_cvtsi32_si128((src[((Y)+1) * srcStride - 1] << 2) + 4); \
+        _tmp1 = _mm_cvtsi32_si128((left[((Y)+1)] << 2) + 4); \
         _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
         v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
-        _tmp0 = _mm_cvtsi32_si128(topRight - src[((Y)) * srcStride - 1]); \
+        _tmp0 = _mm_cvtsi32_si128(topRight - left[(Y)]); \
         _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
-        _tmp1 = _mm_cvtsi32_si128(topRight - src[((Y)+1) * srcStride - 1]); \
+        _tmp1 = _mm_cvtsi32_si128(topRight - left[((Y)+1)]); \
         _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
         v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
         v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
@@ -938,48 +947,38 @@
 #define COMP_PRED_PLANAR_ROW(X) { \
         v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
         v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
-        v_horPred = v_horPred + v_rightColumnN; \
-        v_topRow = v_topRow + v_bottomRow; \
-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
-        store_partial(const_int(16), &dst[X * dstStride], v_im4); \
-}
-
-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), (3 + 1)); \
+        _mm_storeu_si128((__m128i*)&dst[X * dstStride], v_im4); \
+}
+
+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k, bottomLeft, topRight;
-
-    int16_t leftColumn[8];
+    int bottomLeft, topRight;
 
     // Get left and above reference column and row
-    Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &src[-srcStride]); // topRow
-
-    for (k = 0; k < 8; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn);   // leftColumn
+    __m128i v_topRow = _mm_loadu_si128((__m128i*)above); // topRow
+    __m128i v_leftColumn = _mm_loadu_si128((__m128i*)left); // leftColumn
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[8 * srcStride - 1];
-    topRight   = src[8 - srcStride];
-
-    Vec8s v_bottomLeft(bottomLeft);
-    Vec8s v_topRight(topRight);
-
-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;
-    Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
-    int shift = g_convertToBit[8];          // Using value corresponding to width = 8
-    v_topRow = v_topRow << (2 + shift);
-    v_leftColumn = v_leftColumn << (2 + shift);
+    bottomLeft = left[8];
+    topRight   = above[8];
+
+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+    __m128i v_topRight = _mm_set1_epi16(topRight);
+
+    __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+    v_topRow = _mm_slli_epi16(v_topRow, (2 + 1));
+    v_leftColumn = _mm_slli_epi16(v_leftColumn, (2 + 1));
 
     // Generate prediction signal
-    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
-    Vec8s v_horPred, v_rightColumnN;
-    Vec8s v_im4;
-    Vec16uc v_im5;
+    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
+    const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+    __m128i v_horPred, v_rightColumnN;
+    __m128i v_im4;
 
     COMP_PRED_PLANAR_ROW(0);     // row 0
     COMP_PRED_PLANAR_ROW(1);
@@ -1004,27 +1003,20 @@
         store_partial(const_int(8), &dst[X * dstStride], v_im5); \
 }
 
-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k;
     pixel bottomLeft, topRight;
-    int16_t leftColumn[8];
 
     // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &src[-srcStride]); // topRow
+    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), (void*)above); // topRow
     Vec8s v_topRow = extend_low(im0);
 
-    for (k = 0; k < 8; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
+    Vec8s v_leftColumn = _mm_loadl_epi64((__m128i*)left);   // leftColumn
+    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[8 * srcStride - 1];
-    topRight   = src[8 - srcStride];
+    bottomLeft = left[8];
+    topRight   = above[8];
 
     Vec8s v_bottomLeft(bottomLeft);
     Vec8s v_topRight(topRight);
@@ -1055,29 +1047,18 @@
 #undef COMP_PRED_PLANAR_ROW
 
 #if INSTRSET >= 5
-void intra_pred_planar8_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
     pixel bottomLeft, topRight;
 
     // Get left and above reference column and row
-    __m128i im0 = _mm_loadl_epi64((__m128i*)&src[0 - srcStride]); // topRow
-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
-    __m128i v_leftColumn = _mm_setzero_si128();
-
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[0 * srcStride - 1], 0);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[1 * srcStride - 1], 1);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[2 * srcStride - 1], 2);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[3 * srcStride - 1], 3);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[4 * srcStride - 1], 4);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[5 * srcStride - 1], 5);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[6 * srcStride - 1], 6);
-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[7 * srcStride - 1], 7);
-    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
+    __m128i v_topRow = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)above)); // topRow
+
+    __m128i v_leftColumn = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)left));
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[8 * srcStride - 1];
-    topRight   = src[8 - srcStride];
+    bottomLeft = left[8];
+    topRight   = above[8];
 
     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
     __m128i v_topRight   = _mm_set1_epi16(topRight);
@@ -1094,18 +1075,8 @@
     __m128i v_im5;
 
 #define COMP_PRED_PLANAR_ROW(Y) { \
-        if ((Y) < 4) { \
-            v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
-            v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
-            v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
-            v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
-        } \
-        else { \
-            v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
-            v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
-            v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
-            v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
-        } \
+        BROADCAST16(v_horPred4, (Y), v_horPred); \
+        BROADCAST16(v_rightColumn, (Y), v_rightColumnN); \
         v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
         v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
         v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
@@ -1148,29 +1119,22 @@
         v_im4_hi.store(&dst[X * dstStride + 8]); \
 }
 
-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k;
     pixel bottomLeft, topRight;
-    int16_t leftColumn[16];
 
     // Get left and above reference column and row
     Vec8s v_topRow_lo, v_topRow_hi;
 
-    v_topRow_lo.load(&src[-srcStride]);
-    v_topRow_hi.load(&src[-srcStride + 8]);
-
-    for (k = 0; k < 16; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
+    v_topRow_lo.load(&above[0]);
+    v_topRow_hi.load(&above[8]);
 
     Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
+    v_leftColumn.load(left);   // leftColumn
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[16 * srcStride - 1];
-    topRight   = src[16 - srcStride];
+    bottomLeft = left[16];
+    topRight   = above[16];
 
     Vec8s v_bottomLeft(bottomLeft);
     Vec8s v_topRight(topRight);
@@ -1200,7 +1164,7 @@
     COMP_PRED_PLANAR_ROW(6);
     COMP_PRED_PLANAR_ROW(7);     // row 7
 
-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
+    v_leftColumn.load(left + 8);   // leftColumn lower 8 rows
     v_rightColumn = v_topRight - v_leftColumn;
     v_leftColumn = v_leftColumn << (2 + shift);
     v_horPred4 = v_leftColumn + Vec8s(16);
@@ -1235,28 +1199,20 @@
         store_partial(const_int(16), &dst[X * dstStride], v_im5); \
 }
 
-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
-    int k;
     pixel bottomLeft, topRight;
-    int16_t leftColumn[16];
 
     // Get left and above reference column and row
-    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &src[-srcStride]); // topRow
+    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), above); // topRow
     Vec8s v_topRow_lo = extend_low(im0);
     Vec8s v_topRow_hi = extend_high(im0);
 
-    for (k = 0; k < 16; k++)
-    {
-        leftColumn[k] = src[k * srcStride - 1];
-    }
-
-    Vec8s v_leftColumn;
-    v_leftColumn.load(leftColumn);   // leftColumn
+    Vec8s v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)left), _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[16 * srcStride - 1];
-    topRight   = src[16 - srcStride];
+    bottomLeft = left[16];
+    topRight   = above[16];
 
     Vec8s v_bottomLeft(bottomLeft);
     Vec8s v_topRight(topRight);
@@ -1286,7 +1242,8 @@
     COMP_PRED_PLANAR_ROW(6);
     COMP_PRED_PLANAR_ROW(7);     // row 7
 
-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows
+    // leftColumn lower 8 rows
+    v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(left + 8)), _mm_setzero_si128());
     v_rightColumn = v_topRight - v_leftColumn;
     v_leftColumn = v_leftColumn << (2 + shift);
     v_horPred4 = v_leftColumn + Vec8s(16);
@@ -1304,21 +1261,21 @@
 #undef COMP_PRED_PLANAR_ROW
 
 #if INSTRSET >= 5
-void intra_pred_planar16_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
     pixel bottomLeft, topRight;
     __m128i v_topRow[2];
     __m128i v_bottomRow[2];
 
     // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
+    __m128i im0 = _mm_loadu_si128((__m128i*)above); // topRow
 
     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[16 * srcStride - 1];
-    topRight   = src[16 - srcStride];
+    bottomLeft = left[16];
+    topRight   = above[16];
 
     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
 
@@ -1333,10 +1290,10 @@
     __m128i v_im5;
 
 #define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 4) + 16); \
+        v_horPred = _mm_cvtsi32_si128((left[(Y)] << 4) + 16); \
         v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
         _tmp = _mm_shufflelo_epi16(_tmp, 0); \
         _tmp = _mm_shuffle_epi32(_tmp, 0); \
         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1374,15 +1331,15 @@
 #endif // INSTRSET >= 5
 
 #if INSTRSET >= 5
-void intra_pred_planar32_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
     pixel bottomLeft, topRight;
     __m128i v_topRow[4];
     __m128i v_bottomRow[4];
 
     // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
-    __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow
+    __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow
+    __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow
 
     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
@@ -1390,8 +1347,8 @@
     v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[32 * srcStride - 1];
-    topRight   = src[32 - srcStride];
+    bottomLeft = left[32];
+    topRight   = above[32];
 
     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
 
@@ -1410,10 +1367,10 @@
     __m128i v_im5[2];
 
 #define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 5) + 32); \
+        v_horPred = _mm_cvtsi32_si128((left[(Y)] << 5) + 32); \
         v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
         _tmp = _mm_shufflelo_epi16(_tmp, 0); \
         _tmp = _mm_shuffle_epi32(_tmp, 0); \
         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1451,17 +1408,17 @@
 #endif // INSTRSET >= 5
 
 #if INSTRSET >= 5
-void intra_pred_planar64_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar64_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 {
     pixel bottomLeft, topRight;
     __m128i v_topRow[8];
     __m128i v_bottomRow[8];
 
     // Get left and above reference column and row
-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
-    __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow
-    __m128i im2 = _mm_loadu_si128((__m128i*)&src[32 - srcStride]); // topRow
-    __m128i im3 = _mm_loadu_si128((__m128i*)&src[48 - srcStride]); // topRow
+    __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow
+    __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow
+    __m128i im2 = _mm_loadu_si128((__m128i*)&above[32]); // topRow
+    __m128i im3 = _mm_loadu_si128((__m128i*)&above[48]); // topRow
 
     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
@@ -1473,8 +1430,8 @@
     v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = src[64 * srcStride - 1];
-    topRight   = src[64 - srcStride];
+    bottomLeft = left[64];
+    topRight   = above[64];
 
     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
 
@@ -1501,10 +1458,10 @@
     __m128i v_im5[4];
 
 #define COMP_PRED_PLANAR_ROW(Y) { \
-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 6) + 64); \
+        v_horPred = _mm_cvtsi32_si128((left[(Y)] << 6) + 64); \
         v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
         _tmp = _mm_shufflelo_epi16(_tmp, 0); \
         _tmp = _mm_shuffle_epi32(_tmp, 0); \
         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1563,7 +1520,7 @@
 
 #endif /* if HIGH_BIT_DEPTH */
 
-typedef void intra_pred_planar_t (pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride);
+typedef void intra_pred_planar_t (pixel* above, pixel* left, pixel* dst, intptr_t dstStride);
 intra_pred_planar_t *intraPlanarN[] =
 {
 #if !HIGH_BIT_DEPTH && INSTRSET >= 5
@@ -1579,18 +1536,18 @@
 #endif
 };
 
-void intra_pred_planar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)
+void intra_pred_planar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
 {
     int nLog2Size = g_convertToBit[width] + 2;
 
 #if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
-    intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);
+    intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);
     return;
 #else
     int k, l, bottomLeft, topRight;
     int horPred;
     // OPT_ME: when width is 64, the shift1D is 8, then the dynamic range is [-65280, 65280], so we have to use 32 bits here
-    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+    int32_t leftColumn[MAX_CU_SIZE], topRow[MAX_CU_SIZE];
     // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
     int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
     int blkSize = width;
@@ -1600,20 +1557,20 @@
 
     if (width < 32)
     {
-        intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);
+        intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);
         return;
     }
 
     // Get left and above reference column and row
-    for (k = 0; k < blkSize + 1; k++)
+    for (k = 0; k < blkSize; k++)
     {
-        topRow[k] = src[k - srcStride];
-        leftColumn[k] = src[k * srcStride - 1];
+        topRow[k] = above[k];
+        leftColumn[k] = left[k];
     }
 
     // Prepare intermediate variables used in interpolation
-    bottomLeft = leftColumn[blkSize];
-    topRight   = topRow[blkSize];
+    bottomLeft = left[blkSize];
+    topRight   = above[blkSize];
     for (k = 0; k < blkSize; k++)
     {
         bottomRow[k]   = bottomLeft - topRow[k];
diff -r f813f110d69a -r a4013cdafef0 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/encoder/compress.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -123,23 +123,25 @@
         CandNum = 0;
         UInt modeCosts[35];
         Bool bFilter = (width <= 16);
-        Pel *src = m_search->getPredicBuf();
 
         Pel *pAbove0 = m_search->refAbove    + width - 1;
         Pel *pAbove1 = m_search->refAboveFlt + width - 1;
         Pel *pLeft0  = m_search->refLeft     + width - 1;
         Pel *pLeft1  = m_search->refLeftFlt  + width - 1;
+        Pel *pAbove  = pAbove0;
+        Pel *pLeft   = pLeft0;
 
         // 1
-        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);
+        primitives.intra_pred_dc((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width, bFilter);
         modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
 
         // 0
         if (width >= 8 && width <= 32)
         {
-            src += ADI_BUF_STRIDE * (2 * width + 1);
+            pAbove = pAbove1;
+            pLeft  = pLeft1;
         }
-        primitives.intra_pred_planar(src + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);
+        primitives.intra_pred_planar(pAbove + 1, pLeft + 1, pred, stride, width);
         modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
 
         // 33 Angle modes once
diff -r f813f110d69a -r a4013cdafef0 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/test/intrapredharness.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -117,17 +117,26 @@
     {
         for (int i = 0; i <= 100; i++)
         {
+            pixel left[MAX_CU_SIZE * 2 + 1];
+            for (int k = 0; k < width * 2 + 1; k++)
+            {
+                left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
+            }
 #if _DEBUG
             memset(pixel_out_vec, 0xCD, out_size);
             memset(pixel_out_c, 0xCD, out_size);
 #endif
-            ref(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_c,   FENC_STRIDE, width);
-            opt(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_vec, FENC_STRIDE, width);
+            ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, width);
+            opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);
 
             for (int k = 0; k < width; k++)
             {
                 if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
                 {
+#if _DEBUG
+                    ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, width);
+                    opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);
+#endif
                     return false;
                 }
             }
@@ -292,7 +301,7 @@
             width = ii;
             printf("intra_planar%2dx%d", ii, ii);
             REPORT_SPEEDUP(opt.intra_pred_planar, ref.intra_pred_planar,
-                           pixel_buff + srcStride, srcStride, pixel_out_vec, FENC_STRIDE, width);
+                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width);
         }
     }
     if (opt.intra_pred_ang)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/1244b81a/attachment-0001.html>