<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><P># HG changeset patch<BR># User Min Chen &lt;<A href="mailto:chenm003@163.com">chenm003@163.com</A>&gt;<BR># Date 1374133971 -28800<BR># Node ID a4013cdafef00502efe0d496dcb3c4f2bce966a4<BR># Parent  f813f110d69a1a6650e813dd4e612216982a0264<BR>intrapred: improvement intra_pred_planar</P>
<P>diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibCommon/TComPrediction.cpp<BR>--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -177,17 +177,16 @@<BR>     }<BR> <BR>     // get starting pixel in block<BR>-    Int sw = ADI_BUF_STRIDE;<BR>     Bool bFilter = (size <= 16);<BR> <BR>     // Create the prediction<BR>     if (dirMode == PLANAR_IDX)<BR>     {<BR>-        primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, size);<BR>+        primitives.intra_pred_planar((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size);<BR>     }<BR>     else if (dirMode == DC_IDX)<BR>   
   {<BR>-        primitives.intra_pred_dc(refAbv + 1, refLft + 1, dst, stride, size, bFilter);<BR>+        primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);<BR>     }<BR>     else<BR>     {<BR>@@ -198,33 +197,28 @@<BR> // Angular chroma<BR> Void TComPrediction::predIntraChromaAng(Pel* src, UInt dirMode, Pel* dst, UInt stride, Int width)<BR> {<BR>+    // Create the prediction<BR>+    Pel refAbv[3 * MAX_CU_SIZE];<BR>+    Pel refLft[3 * MAX_CU_SIZE];<BR>+    int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1 + 1) : (2 * width + 1);<BR>+    memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));<BR>+    for (int k = 0; k < limit; k++)<BR>+    {<BR>+  &n
 bsp;     refLft[k + width - 1] = src[k * ADI_BUF_STRIDE];<BR>+    }<BR>+<BR>     // get starting pixel in block<BR>-    Int sw = ADI_BUF_STRIDE;<BR>-<BR>     if (dirMode == PLANAR_IDX)<BR>     {<BR>-        primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, width);<BR>+        primitives.intra_pred_planar((pixel*)refAbv + width - 1 + 1, (pixel*)refLft + width - 1 + 1, (pixel*)dst, stride, width);<BR>+    }<BR>+    else if (dirMode == DC_IDX)<BR>+    {<BR>+        primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);<BR>     }<BR>     else<BR>     {<BR>-        // Create the predi
 ction<BR>-        Pel refAbv[3 * MAX_CU_SIZE];<BR>-        Pel refLft[3 * MAX_CU_SIZE];<BR>-        int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1) : (2 * width + 1);<BR>-        memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));<BR>-        for (int k = 0; k < limit; k++)<BR>-        {<BR>-            refLft[k + width - 1] = src[k * sw];<BR>-        }<BR>-<BR>-        if (dirMode == DC_IDX)<BR>-        {<BR>-            primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);<BR>- &nb
 sp;      }<BR>-        else<BR>-        {<BR>-            primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);<BR>-        }<BR>+        primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);<BR>     }<BR> }<BR> <BR>diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibEncoder/TEncSearch.cpp<BR>--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -1975,6 +1975,8 @@<BR>             Pel *pAbove1 = refAboveFlt + width - 1;<BR>        
      Pel *pLeft0  = refLeft     + width - 1;<BR>             Pel *pLeft1  = refLeftFlt  + width - 1;<BR>+            Pel *above   = pAbove0;<BR>+            Pel *left    = pLeft0;<BR> <BR>             // 1<BR>             primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);<BR>@@ -1984,8 +1986,10 @@<BR>             if (width >= 8 && width <= 32)<BR>             {<BR>               &
 nbsp; predSrc += ADI_BUF_STRIDE * (2 * width + 1);<BR>+                above = pAbove1;<BR>+                left  = pLeft1;<BR>             }<BR>-            primitives.intra_pred_planar(predSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);<BR>+            primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);<BR>             modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR>             // 33 Angle modes once<BR>diff -r f813f110d69a -r a4013cdafef0 source/common/intrapred.cpp<BR
 >--- a/source/common/intrapred.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/intrapred.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -98,7 +98,7 @@<BR>     }<BR> }<BR> <BR>-void PredIntraPlanar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)<BR>+void PredIntraPlanar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)<BR> {<BR>     //assert(width == height);<BR> <BR>@@ -117,8 +117,8 @@<BR>     // Get left and above reference column and row<BR>     for (k = 0; k < blkSize + 1; k++)<BR>     {<BR>-        topRow[k] = src[k - srcStride];<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>+        topRow[k] = above[k];<BR>+        leftColumn[k] = left[k];<BR> &
 nbsp;   }<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>diff -r f813f110d69a -r a4013cdafef0 source/common/primitives.h<BR>--- a/source/common/primitives.h Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/primitives.h Thu Jul 18 15:52:51 2013 +0800<BR>@@ -195,7 +195,7 @@<BR> typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); <BR> <BR> typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);<BR>-typedef void (*intra_planar_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width);<BR>+typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);<BR> typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);<BR> typede
 f void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);<BR> <BR>diff -r f813f110d69a -r a4013cdafef0 source/common/vec/intrapred.inc<BR>--- a/source/common/vec/intrapred.inc Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/vec/intrapred.inc Thu Jul 18 15:52:51 2013 +0800<BR>@@ -746,27 +746,42 @@<BR> #endif // if HIGH_BIT_DEPTH<BR> }<BR> <BR>+#if INSTRSET >= 4  // SSSE3<BR>+    #define BROADCAST16(a, d, x) { \<BR>+        const __m128i mask = _mm_set1_epi16( (((d) * 2) | ((d) * 2 + 1) << 8) ); \<BR>+        (x) = _mm_shuffle_epi8((a), mask); \<BR>+    }<BR>+#else<BR>+    #define BROADCAST16(a, d, x) { \<BR>+        const int dL = (d) & 3; \<BR>+        const int dH = ((d)-4) & 3; \<BR>+ &n
 bsp;      if (d>=4) { \<BR>+            (x) = _mm_shufflehi_epi16((a), dH * 0x55); \<BR>+            (x) = _mm_unpackhi_epi64((x), (x)); \<BR>+        } \<BR>+        else { \<BR>+            (x) = _mm_shufflelo_epi16((a), dL * 0x55); \<BR>+            (x) = _mm_unpacklo_epi64((x), (x)); \<BR>+        } \<BR>+    }<BR>+#endif<BR>+<BR>+<BR> #if HIGH_BIT_DEPTH<BR>-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp<BR>-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input is 12b
 pp<BR>+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k, bottomLeft, topRight;<BR>+    int bottomLeft, topRight;<BR>     // NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits<BR>-    int16_t leftColumn[4];<BR> <BR>     // Get left and above reference column and row<BR>-    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &src[-srcStride]); // topRow<BR>-<BR>-    for (k = 0; k < 4; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>-<BR>-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn);   // leftColumn<BR>+    Vec8s v_topRow = (Vec8s)load_partial(const_int(8), above); // topRow<BR>+
 <BR>+    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), left);   // leftColumn<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[4 * srcStride - 1];<BR>-    topRight   = src[4 - srcStride];<BR>+    bottomLeft = left[4];<BR>+    topRight   = above[4];<BR> <BR>     Vec8s v_bottomLeft(bottomLeft);<BR>     Vec8s v_topRight(topRight);<BR>@@ -819,51 +834,45 @@<BR> }<BR> <BR> #else /* if HIGH_BIT_DEPTH */<BR>-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k;<BR>     pixel bottomLeft, topRight;<BR> <BR>     // Get left and above reference column 
 and row<BR>-    Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &src[-srcStride]); // topRow<BR>-    Vec8s v_topRow = extend_low(im0);<BR>-<BR>-    int16_t leftColumn[4];<BR>-<BR>-    for (k = 0; k < 4; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>-<BR>-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn);   // leftColumn<BR>+    __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow<BR>+    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>+<BR>+    __m128i v_leftColumn = _mm_cvtsi32_si128(*(int*)left);  // leftColumn<BR>+    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolatio
 n<BR>-    bottomLeft = src[4 * srcStride - 1];<BR>-    topRight   = src[4 - srcStride];<BR>-<BR>-    Vec8s v_bottomLeft(bottomLeft);<BR>-    Vec8s v_topRight(topRight);<BR>-<BR>-    Vec8s v_bottomRow = v_bottomLeft - v_topRow;<BR>-    Vec8s v_rightColumn = v_topRight - v_leftColumn;<BR>-<BR>-    v_topRow = v_topRow << const_int(2);<BR>-    v_leftColumn = v_leftColumn << const_int(2);<BR>-<BR>-    Vec8s v_horPred4 = v_leftColumn + Vec8s(4);<BR>-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);<BR>-    Vec8s v_horPred, v_rightColumnN;<BR>-    Vec8s v_im4;<BR>-    Vec16uc v_im5;<BR>+    bottomLeft = left[4];<BR>+    topRight   = above[4];<BR>+<BR>+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>+  &n
 bsp; __m128i v_topRight = _mm_set1_epi16(topRight);<BR>+<BR>+    __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>+    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);<BR>+<BR>+    v_topRow = _mm_slli_epi16(v_topRow, 2);<BR>+    v_leftColumn = _mm_slli_epi16(v_leftColumn, 2);<BR>+<BR>+    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(4));<BR>+    const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);<BR>+    __m128i v_horPred, v_rightColumnN;<BR>+    __m128i v_im4;<BR>+    __m128i v_im5;<BR> <BR> #define COMP_PRED_PLANAR4_ROW(X) { \<BR>-        v_horPred = broadcast(const_int((X)), v_horPred4); \<BR>-        v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \<BR>-     &nb
 sp;  v_horPred = v_horPred + v_rightColumnN; \<BR>-        v_topRow = v_topRow + v_bottomRow; \<BR>-        v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \<BR>-        v_im5 = compress_unsafe(v_im4, v_im4); \<BR>-        store_partial(const_int(4), &dst[(X)*dstStride], v_im5); \<BR>+        BROADCAST16(v_horPred4, (X), v_horPred); \<BR>+        BROADCAST16(v_rightColumn, (X), v_rightColumnN); \<BR>+        v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi); \<BR>+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR>+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>+        v_im4 = _
 mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \<BR>+        v_im5 = _mm_packus_epi16(v_im4, v_im4); \<BR>+        *(int*)&dst[(X)*dstStride] = _mm_cvtsi128_si32(v_im5); \<BR> }<BR> <BR>     COMP_PRED_PLANAR4_ROW(0)<BR>@@ -875,19 +884,19 @@<BR> }<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar4_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>     pixel bottomLeft, topRight;<BR> <BR>     // Get left and above reference column and row<BR>-    __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&src[-srcStride]); // topRow<BR>-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>+    __m128i im0 = _mm_cvtsi32_si128(*(int*)abov
 e); // topRow<BR>+    __m128i v_topRow = _mm_cvtepu8_epi16(im0);<BR> <BR>     v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[4 * srcStride - 1];<BR>-    topRight   = src[4 - srcStride];<BR>+    bottomLeft = left[4];<BR>+    topRight   = above[4];<BR> <BR>     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>     __m128i v_bottomRow   = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>@@ -904,14 +913,14 @@<BR>     v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);<BR> <BR> #define COMP_PRED_PLANAR_2ROW(Y) { \<BR>-        _tmp0 = _mm_cvtsi32_si128((src[((Y)) * srcStride - 1] << 2) + 4); \<BR>+       
  _tmp0 = _mm_cvtsi32_si128((left[(Y)] << 2) + 4); \<BR>         _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \<BR>-        _tmp1 = _mm_cvtsi32_si128((src[((Y)+1) * srcStride - 1] << 2) + 4); \<BR>+        _tmp1 = _mm_cvtsi32_si128((left[((Y)+1)] << 2) + 4); \<BR>         _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \<BR>         v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \<BR>-        _tmp0 = _mm_cvtsi32_si128(topRight - src[((Y)) * srcStride - 1]); \<BR>+        _tmp0 = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR>         _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \<BR>-        _tmp1 = _mm_cvtsi32_si128(topRight - src[((Y)+1) * srcStride - 1]); \
 <BR>+        _tmp1 = _mm_cvtsi32_si128(topRight - left[((Y)+1)]); \<BR>         _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \<BR>         v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \<BR>         v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \<BR>@@ -938,48 +947,38 @@<BR> #define COMP_PRED_PLANAR_ROW(X) { \<BR>         v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \<BR>         v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \<BR>-        v_horPred = v_horPred + v_rightColumnN; \<BR>-        v_topRow = v_topRow + v_bottomRow; \<BR>-        v_im4 = (Vec8s)(v_horPred + v_to
 pRow) >> (3 + shift); \<BR>-        store_partial(const_int(16), &dst[X * dstStride], v_im4); \<BR>-}<BR>-<BR>-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+        v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR>+        v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>+        v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), (3 + 1)); \<BR>+        _mm_storeu_si128((__m128i*)&dst[X * dstStride], v_im4); \<BR>+}<BR>+<BR>+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k, bottomLeft, topRight;<BR>-<BR>-    int16_t leftColumn[8];<BR>+    int bottomLeft, topRight;<BR> <BR>     // Get left and above refe
 rence column and row<BR>-    Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &src[-srcStride]); // topRow<BR>-<BR>-    for (k = 0; k < 8; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>-<BR>-    Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn);   // leftColumn<BR>+    __m128i v_topRow = _mm_loadu_si128((__m128i*)above); // topRow<BR>+    __m128i v_leftColumn = _mm_loadu_si128((__m128i*)left); // leftColumn<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[8 * srcStride - 1];<BR>-    topRight   = src[8 - srcStride];<BR>-<BR>-    Vec8s v_bottomLeft(bottomLeft);<BR>-    Vec8s v_topRight(topRight);<BR>-<BR>-    Vec8s v_bottomR
 ow = v_bottomLeft - v_topRow;<BR>-    Vec8s v_rightColumn = v_topRight - v_leftColumn;<BR>-<BR>-    int shift = g_convertToBit[8];          // Using value corresponding to width = 8<BR>-    v_topRow = v_topRow << (2 + shift);<BR>-    v_leftColumn = v_leftColumn << (2 + shift);<BR>+    bottomLeft = left[8];<BR>+    topRight   = above[8];<BR>+<BR>+    __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>+    __m128i v_topRight = _mm_set1_epi16(topRight);<BR>+<BR>+    __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>+    __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);<BR>+<BR>+    v_topRow = _mm_slli_epi16(v_topRow, (2 + 1));<BR>+    v_leftColumn = _mm_slli_epi16(v_leftColumn, (2 + 1));<BR> <BR> &nbsp
 ;   // Generate prediction signal<BR>-    Vec8s v_horPred4 = v_leftColumn + Vec8s(8);<BR>-    const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);<BR>-    Vec8s v_horPred, v_rightColumnN;<BR>-    Vec8s v_im4;<BR>-    Vec16uc v_im5;<BR>+    __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));<BR>+    const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);<BR>+    __m128i v_horPred, v_rightColumnN;<BR>+    __m128i v_im4;<BR> <BR>     COMP_PRED_PLANAR_ROW(0);     // row 0<BR>     COMP_PRED_PLANAR_ROW(1);<BR>@@ -1004,27 +1003,20 @@<BR>         store_partial(const_int(8), &dst[X * dstStride], v_im5); \<BR> }<BR> <BR>-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void 
 intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k;<BR>     pixel bottomLeft, topRight;<BR>-    int16_t leftColumn[8];<BR> <BR>     // Get left and above reference column and row<BR>-    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &src[-srcStride]); // topRow<BR>+    Vec16uc im0 = (Vec16uc)load_partial(const_int(8), (void*)above); // topRow<BR>     Vec8s v_topRow = extend_low(im0);<BR> <BR>-    for (k = 0; k < 8; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>-<BR>-    Vec8s v_leftColumn;<BR>-    v_leftColumn.load(leftColumn);   // leftColumn<BR>+    Vec8s v_leftColumn = _mm_loadl_epi64((__m128i*)left);   // leftColu
 mn<BR>+    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[8 * srcStride - 1];<BR>-    topRight   = src[8 - srcStride];<BR>+    bottomLeft = left[8];<BR>+    topRight   = above[8];<BR> <BR>     Vec8s v_bottomLeft(bottomLeft);<BR>     Vec8s v_topRight(topRight);<BR>@@ -1055,29 +1047,18 @@<BR> #undef COMP_PRED_PLANAR_ROW<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar8_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>     pixel bottomLeft, topRight;<BR> <BR>     // Get left and above reference column and row<BR>- 
    __m128i im0 = _mm_loadl_epi64((__m128i*)&src[0 - srcStride]); // topRow<BR>-    __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>-<BR>-    __m128i v_leftColumn = _mm_setzero_si128();<BR>-<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[0 * srcStride - 1], 0);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[1 * srcStride - 1], 1);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[2 * srcStride - 1], 2);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[3 * srcStride - 1], 3);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[4 * srcStride - 1], 4);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[5 * srcStride - 1], 5);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[6 * srcStride - 1], 6);<BR>-    v_leftColumn = _mm_insert_epi8(v_leftColumn, src[
 7 * srcStride - 1], 7);<BR>-    v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR>+    __m128i v_topRow = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)above)); // topRow<BR>+<BR>+    __m128i v_leftColumn = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)left));<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[8 * srcStride - 1];<BR>-    topRight   = src[8 - srcStride];<BR>+    bottomLeft = left[8];<BR>+    topRight   = above[8];<BR> <BR>     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>     __m128i v_topRight   = _mm_set1_epi16(topRight);<BR>@@ -1094,18 +1075,8 @@<BR>     __m128i v_im5;<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>-        if ((
 Y) < 4) { \<BR>-            v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \<BR>-            v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \<BR>-            v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \<BR>-            v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \<BR>-        } \<BR>-        else { \<BR>-            v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \<BR>-            v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \<BR>-       &nbsp
 ;    v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \<BR>-            v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \<BR>-        } \<BR>+        BROADCAST16(v_horPred4, (Y), v_horPred); \<BR>+        BROADCAST16(v_rightColumn, (Y), v_rightColumnN); \<BR>         v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \<BR>         v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR>         v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>@@ -1148,29 +1119,22 @@<BR>         v_im4_hi.store(&dst[X * dstStride + 8]); \<BR> }<BR> <BR>-void intra_pred_planar16(pixel* src, intptr
 _t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k;<BR>     pixel bottomLeft, topRight;<BR>-    int16_t leftColumn[16];<BR> <BR>     // Get left and above reference column and row<BR>     Vec8s v_topRow_lo, v_topRow_hi;<BR> <BR>-    v_topRow_lo.load(&src[-srcStride]);<BR>-    v_topRow_hi.load(&src[-srcStride + 8]);<BR>-<BR>-    for (k = 0; k < 16; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>+    v_topRow_lo.load(&above[0]);<BR>+    v_topRow_hi.load(&above[8]);<BR> <BR>     Vec8s v_leftColumn;<BR>-    v_leftColumn.load(leftColumn);   // le
 ftColumn<BR>+    v_leftColumn.load(left);   // leftColumn<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[16 * srcStride - 1];<BR>-    topRight   = src[16 - srcStride];<BR>+    bottomLeft = left[16];<BR>+    topRight   = above[16];<BR> <BR>     Vec8s v_bottomLeft(bottomLeft);<BR>     Vec8s v_topRight(topRight);<BR>@@ -1200,7 +1164,7 @@<BR>     COMP_PRED_PLANAR_ROW(6);<BR>     COMP_PRED_PLANAR_ROW(7);     // row 7<BR> <BR>-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows<BR>+    v_leftColumn.load(left + 8);   // leftColumn lower 8 rows<BR>     v_rightColumn = v_topRight - v_leftColumn;<BR>     v_
 leftColumn = v_leftColumn << (2 + shift);<BR>     v_horPred4 = v_leftColumn + Vec8s(16);<BR>@@ -1235,28 +1199,20 @@<BR>         store_partial(const_int(16), &dst[X * dstStride], v_im5); \<BR> }<BR> <BR>-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>-    int k;<BR>     pixel bottomLeft, topRight;<BR>-    int16_t leftColumn[16];<BR> <BR>     // Get left and above reference column and row<BR>-    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &src[-srcStride]); // topRow<BR>+    Vec16uc im0 = (Vec16uc)load_partial(const_int(16), above); // topRow<BR>     Vec8s v_topRow_lo = extend_low(im0);<BR>     Vec8s v_topRow_h
 i = extend_high(im0);<BR> <BR>-    for (k = 0; k < 16; k++)<BR>-    {<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>-    }<BR>-<BR>-    Vec8s v_leftColumn;<BR>-    v_leftColumn.load(leftColumn);   // leftColumn<BR>+    Vec8s v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)left), _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[16 * srcStride - 1];<BR>-    topRight   = src[16 - srcStride];<BR>+    bottomLeft = left[16];<BR>+    topRight   = above[16];<BR> <BR>     Vec8s v_bottomLeft(bottomLeft);<BR>     Vec8s v_topRight(topRight);<BR>@@ -1286,7 +1242,8 @@<BR>     COMP_PRED_PLANAR_ROW(6);<BR>
      COMP_PRED_PLANAR_ROW(7);     // row 7<BR> <BR>-    v_leftColumn.load(leftColumn + 8);   // leftColumn lower 8 rows<BR>+    // leftColumn lower 8 rows<BR>+    v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(left + 8)), _mm_setzero_si128());<BR>     v_rightColumn = v_topRight - v_leftColumn;<BR>     v_leftColumn = v_leftColumn << (2 + shift);<BR>     v_horPred4 = v_leftColumn + Vec8s(16);<BR>@@ -1304,21 +1261,21 @@<BR> #undef COMP_PRED_PLANAR_ROW<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar16_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>     pixel bottomLeft, topRight;<BR>     __m128i v_topRow[2];<BR> &nbs
 p;   __m128i v_bottomRow[2];<BR> <BR>     // Get left and above reference column and row<BR>-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>+    __m128i im0 = _mm_loadu_si128((__m128i*)above); // topRow<BR> <BR>     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[16 * srcStride - 1];<BR>-    topRight   = src[16 - srcStride];<BR>+    bottomLeft = left[16];<BR>+    topRight   = above[16];<BR> <BR>     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1333,10 +1290,10 @@<BR>     __m128i v_im5;<BR> <BR>&n
 bsp;#define COMP_PRED_PLANAR_ROW(Y) { \<BR>-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 4) + 16); \<BR>+        v_horPred = _mm_cvtsi32_si128((left[(Y)] << 4) + 16); \<BR>         v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR>         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR>         _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR>         _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR>         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1374,15 +
 1331,15 @@<BR> #endif // INSTRSET >= 5<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar32_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>     pixel bottomLeft, topRight;<BR>     __m128i v_topRow[4];<BR>     __m128i v_bottomRow[4];<BR> <BR>     // Get left and above reference column and row<BR>-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>-    __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow<BR>+    __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow<BR>+    __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow<BR> <BR>     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_s
 i128());<BR>     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR>@@ -1390,8 +1347,8 @@<BR>     v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[32 * srcStride - 1];<BR>-    topRight   = src[32 - srcStride];<BR>+    bottomLeft = left[32];<BR>+    topRight   = above[32];<BR> <BR>     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1410,10 +1367,10 @@<BR>     __m128i v_im5[2];<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 5) + 32); \<BR>+        v_horPred = _mm_cvtsi32_si128((left[(Y)] << 5) + 32); \<BR>&nb
 sp;        v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR>         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR>         _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR>         _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR>         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1451,17 +1408,17 @@<BR> #endif // INSTRSET >= 5<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar64_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar64_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
 <BR> {<BR>     pixel bottomLeft, topRight;<BR>     __m128i v_topRow[8];<BR>     __m128i v_bottomRow[8];<BR> <BR>     // Get left and above reference column and row<BR>-    __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>-    __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow<BR>-    __m128i im2 = _mm_loadu_si128((__m128i*)&src[32 - srcStride]); // topRow<BR>-    __m128i im3 = _mm_loadu_si128((__m128i*)&src[48 - srcStride]); // topRow<BR>+    __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow<BR>+    __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow<BR>+    __m128i im2 = _mm_loadu_si128((__m128i*)&above[32]); // topRow<BR>+    __m128i im3 = _mm_loadu_si128((__m128i*)&above[4
 8]); // topRow<BR> <BR>     v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>     v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR>@@ -1473,8 +1430,8 @@<BR>     v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = src[64 * srcStride - 1];<BR>-    topRight   = src[64 - srcStride];<BR>+    bottomLeft = left[64];<BR>+    topRight   = above[64];<BR> <BR>     __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1501,10 +1458,10 @@<BR>     __m128i v_im5[4];<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>-        v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 6) + 64); \<BR>+ &n
 bsp;      v_horPred = _mm_cvtsi32_si128((left[(Y)] << 6) + 64); \<BR>         v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR>         v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>-        __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+        __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR>         _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR>         _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR>         v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1563,7 +1520,7 @@<BR> <BR> #endif /* if HIGH_BIT_DEPTH */<BR> <BR>-typedef void intra_pred_planar_t (pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride);<BR
 >+typedef void intra_pred_planar_t (pixel* above, pixel* left, pixel* dst, intptr_t dstStride);<BR> intra_pred_planar_t *intraPlanarN[] =<BR> {<BR> #if !HIGH_BIT_DEPTH && INSTRSET >= 5<BR>@@ -1579,18 +1536,18 @@<BR> #endif<BR> };<BR> <BR>-void intra_pred_planar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)<BR>+void intra_pred_planar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)<BR> {<BR>     int nLog2Size = g_convertToBit[width] + 2;<BR> <BR> #if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)<BR>-    intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);<BR>+    intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);<BR>     return;<BR> #else<BR>     int k, l, bottomLeft, topRight;<BR>     int horPred;<BR>     // OPT_ME: when width is
  64, the shift1D is 8, then the dynamic range is [-65280, 65280], so we have to use 32 bits here<BR>-    int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];<BR>+    int32_t leftColumn[MAX_CU_SIZE], topRow[MAX_CU_SIZE];<BR>     // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)<BR>     int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];<BR>     int blkSize = width;<BR>@@ -1600,20 +1557,20 @@<BR> <BR>     if (width < 32)<BR>     {<BR>-        intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);<BR>+        intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);<BR>         return;<BR>     }<BR> <BR>     // Get left and above reference
  column and row<BR>-    for (k = 0; k < blkSize + 1; k++)<BR>+    for (k = 0; k < blkSize; k++)<BR>     {<BR>-        topRow[k] = src[k - srcStride];<BR>-        leftColumn[k] = src[k * srcStride - 1];<BR>+        topRow[k] = above[k];<BR>+        leftColumn[k] = left[k];<BR>     }<BR> <BR>     // Prepare intermediate variables used in interpolation<BR>-    bottomLeft = leftColumn[blkSize];<BR>-    topRight   = topRow[blkSize];<BR>+    bottomLeft = left[blkSize];<BR>+    topRight   = above[blkSize];<BR>     for (k = 0; k < blkSize; k++)<BR>     {<BR>         bottomRow[k]   = b
 ottomLeft - topRow[k];<BR>diff -r f813f110d69a -r a4013cdafef0 source/encoder/compress.cpp<BR>--- a/source/encoder/compress.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/encoder/compress.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -123,23 +123,25 @@<BR>         CandNum = 0;<BR>         UInt modeCosts[35];<BR>         Bool bFilter = (width <= 16);<BR>-        Pel *src = m_search->getPredicBuf();<BR> <BR>         Pel *pAbove0 = m_search->refAbove    + width - 1;<BR>         Pel *pAbove1 = m_search->refAboveFlt + width - 1;<BR>         Pel *pLeft0  = m_search->refLeft     + width - 1;<BR>         Pel *pLef
 t1  = m_search->refLeftFlt  + width - 1;<BR>+        Pel *pAbove  = pAbove0;<BR>+        Pel *pLeft   = pLeft0;<BR> <BR>         // 1<BR>-        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);<BR>+        primitives.intra_pred_dc((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width, bFilter);<BR>         modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR>         // 0<BR>         if (width >= 8 && width <= 32)<BR>         {<BR>-            src += ADI_BUF_STRIDE * (2 * width + 1);<BR>
 + &nbsp;          pAbove = pAbove1;<BR>+            pLeft  = pLeft1;<BR>         }<BR>-        primitives.intra_pred_planar(src + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);<BR>+        primitives.intra_pred_planar(pAbove + 1, pLeft + 1, pred, stride, width);<BR>         modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR>         // 33 Angle modes once<BR>diff -r f813f110d69a -r a4013cdafef0 source/test/intrapredharness.cpp<BR>--- a/source/test/intrapredharness.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/test/intrapredharness.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -117,17 +117,26 @@<BR>     {<BR>
          for (int i = 0; i <= 100; i++)<BR>         {<BR>+            pixel left[MAX_CU_SIZE * 2 + 1];<BR>+            for (int k = 0; k < width * 2 + 1; k++)<BR>+            {<BR>+                left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];<BR>+            }<BR> #if _DEBUG<BR>             memset(pixel_out_vec, 0xCD, out_size);<BR>             memset(pixel_out_c, 0xCD, out_size);<BR> #endif<BR>-            ref(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_
 c,   FENC_STRIDE, width);<BR>-            opt(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_vec, FENC_STRIDE, width);<BR>+            ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, width);<BR>+            opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);<BR> <BR>             for (int k = 0; k < width; k++)<BR>             {<BR>                 if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))<BR>                 {<BR>+#if _DEBUG<BR>
 +                    ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, width);<BR>+                    opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);<BR>+#endif<BR>                     return false;<BR>                 }<BR>             }<BR>@@ -292,7 +301,7 @@<BR>             width = ii;<BR>             printf("intra_planar%2dx%d", ii, ii);<BR>
     &nbsp;        REPORT_SPEEDUP(opt.intra_pred_planar, ref.intra_pred_planar,<BR>-                           pixel_buff + srcStride, srcStride, pixel_out_vec, FENC_STRIDE, width);<BR>+                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width);<BR>         }<BR>     }<BR>     if (opt.intra_pred_ang)<BR></P>
<DIV></DIV></div>