<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><P># HG changeset patch<BR># User Min Chen &lt;<A href="mailto:chenm003@163.com">chenm003@163.com</A>&gt;<BR># Date 1374133971 -28800<BR># Node ID a4013cdafef00502efe0d496dcb3c4f2bce966a4<BR># Parent f813f110d69a1a6650e813dd4e612216982a0264<BR>intrapred: improvement intra_pred_planar</P>
<P>diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibCommon/TComPrediction.cpp<BR>--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -177,17 +177,16 @@<BR> }<BR> <BR> // get starting pixel in block<BR>- Int sw = ADI_BUF_STRIDE;<BR> Bool bFilter = (size <= 16);<BR> <BR> // Create the prediction<BR> if (dirMode == PLANAR_IDX)<BR> {<BR>- primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, size);<BR>+ primitives.intra_pred_planar((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size);<BR> }<BR> else if (dirMode == DC_IDX)<BR>
{<BR>- primitives.intra_pred_dc(refAbv + 1, refLft + 1, dst, stride, size, bFilter);<BR>+ primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);<BR> }<BR> else<BR> {<BR>@@ -198,33 +197,28 @@<BR> // Angular chroma<BR> Void TComPrediction::predIntraChromaAng(Pel* src, UInt dirMode, Pel* dst, UInt stride, Int width)<BR> {<BR>+ // Create the prediction<BR>+ Pel refAbv[3 * MAX_CU_SIZE];<BR>+ Pel refLft[3 * MAX_CU_SIZE];<BR>+ int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1 + 1) : (2 * width + 1);<BR>+ memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));<BR>+ for (int k = 0; k < limit; k++)<BR>+ {<BR>+ &n
bsp; refLft[k + width - 1] = src[k * ADI_BUF_STRIDE];<BR>+ }<BR>+<BR> // get starting pixel in block<BR>- Int sw = ADI_BUF_STRIDE;<BR>-<BR> if (dirMode == PLANAR_IDX)<BR> {<BR>- primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, width);<BR>+ primitives.intra_pred_planar((pixel*)refAbv + width - 1 + 1, (pixel*)refLft + width - 1 + 1, (pixel*)dst, stride, width);<BR>+ }<BR>+ else if (dirMode == DC_IDX)<BR>+ {<BR>+ primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);<BR> }<BR> else<BR> {<BR>- // Create the predi
ction<BR>- Pel refAbv[3 * MAX_CU_SIZE];<BR>- Pel refLft[3 * MAX_CU_SIZE];<BR>- int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1) : (2 * width + 1);<BR>- memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));<BR>- for (int k = 0; k < limit; k++)<BR>- {<BR>- refLft[k + width - 1] = src[k * sw];<BR>- }<BR>-<BR>- if (dirMode == DC_IDX)<BR>- {<BR>- primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);<BR>- &nb
sp; }<BR>- else<BR>- {<BR>- primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);<BR>- }<BR>+ primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);<BR> }<BR> }<BR> <BR>diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibEncoder/TEncSearch.cpp<BR>--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -1975,6 +1975,8 @@<BR> Pel *pAbove1 = refAboveFlt + width - 1;<BR>
Pel *pLeft0 = refLeft + width - 1;<BR> Pel *pLeft1 = refLeftFlt + width - 1;<BR>+ Pel *above = pAbove0;<BR>+ Pel *left = pLeft0;<BR> <BR> // 1<BR> primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);<BR>@@ -1984,8 +1986,10 @@<BR> if (width >= 8 && width <= 32)<BR> {<BR> &
nbsp; predSrc += ADI_BUF_STRIDE * (2 * width + 1);<BR>+ above = pAbove1;<BR>+ left = pLeft1;<BR> }<BR>- primitives.intra_pred_planar(predSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);<BR>+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);<BR> modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR> // 33 Angle modes once<BR>diff -r f813f110d69a -r a4013cdafef0 source/common/intrapred.cpp<BR
>--- a/source/common/intrapred.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/intrapred.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -98,7 +98,7 @@<BR> }<BR> }<BR> <BR>-void PredIntraPlanar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)<BR>+void PredIntraPlanar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)<BR> {<BR> //assert(width == height);<BR> <BR>@@ -117,8 +117,8 @@<BR> // Get left and above reference column and row<BR> for (k = 0; k < blkSize + 1; k++)<BR> {<BR>- topRow[k] = src[k - srcStride];<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>+ topRow[k] = above[k];<BR>+ leftColumn[k] = left[k];<BR> &
nbsp; }<BR> <BR> // Prepare intermediate variables used in interpolation<BR>diff -r f813f110d69a -r a4013cdafef0 source/common/primitives.h<BR>--- a/source/common/primitives.h Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/primitives.h Thu Jul 18 15:52:51 2013 +0800<BR>@@ -195,7 +195,7 @@<BR> typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); <BR> <BR> typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);<BR>-typedef void (*intra_planar_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width);<BR>+typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);<BR> typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);<BR> typede
f void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);<BR> <BR>diff -r f813f110d69a -r a4013cdafef0 source/common/vec/intrapred.inc<BR>--- a/source/common/vec/intrapred.inc Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/common/vec/intrapred.inc Thu Jul 18 15:52:51 2013 +0800<BR>@@ -746,27 +746,42 @@<BR> #endif // if HIGH_BIT_DEPTH<BR> }<BR> <BR>+#if INSTRSET >= 4 // SSSE3<BR>+ #define BROADCAST16(a, d, x) { \<BR>+ const __m128i mask = _mm_set1_epi16( (((d) * 2) | ((d) * 2 + 1) << 8) ); \<BR>+ (x) = _mm_shuffle_epi8((a), mask); \<BR>+ }<BR>+#else<BR>+ #define BROADCAST16(a, d, x) { \<BR>+ const int dL = (d) & 3; \<BR>+ const int dH = ((d)-4) & 3; \<BR>+ &n
bsp; if (d>=4) { \<BR>+ (x) = _mm_shufflehi_epi16((a), dH * 0x55); \<BR>+ (x) = _mm_unpackhi_epi64((x), (x)); \<BR>+ } \<BR>+ else { \<BR>+ (x) = _mm_shufflelo_epi16((a), dL * 0x55); \<BR>+ (x) = _mm_unpacklo_epi64((x), (x)); \<BR>+ } \<BR>+ }<BR>+#endif<BR>+<BR>+<BR> #if HIGH_BIT_DEPTH<BR>-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp<BR>-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input is 12b
pp<BR>+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k, bottomLeft, topRight;<BR>+ int bottomLeft, topRight;<BR> // NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits<BR>- int16_t leftColumn[4];<BR> <BR> // Get left and above reference column and row<BR>- Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &src[-srcStride]); // topRow<BR>-<BR>- for (k = 0; k < 4; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>-<BR>- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn); // leftColumn<BR>+ Vec8s v_topRow = (Vec8s)load_partial(const_int(8), above); // topRow<BR>+
<BR>+ Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), left); // leftColumn<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[4 * srcStride - 1];<BR>- topRight = src[4 - srcStride];<BR>+ bottomLeft = left[4];<BR>+ topRight = above[4];<BR> <BR> Vec8s v_bottomLeft(bottomLeft);<BR> Vec8s v_topRight(topRight);<BR>@@ -819,51 +834,45 @@<BR> }<BR> <BR> #else /* if HIGH_BIT_DEPTH */<BR>-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k;<BR> pixel bottomLeft, topRight;<BR> <BR> // Get left and above reference column
and row<BR>- Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &src[-srcStride]); // topRow<BR>- Vec8s v_topRow = extend_low(im0);<BR>-<BR>- int16_t leftColumn[4];<BR>-<BR>- for (k = 0; k < 4; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>-<BR>- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn); // leftColumn<BR>+ __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow<BR>+ __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>+<BR>+ __m128i v_leftColumn = _mm_cvtsi32_si128(*(int*)left); // leftColumn<BR>+ v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolatio
n<BR>- bottomLeft = src[4 * srcStride - 1];<BR>- topRight = src[4 - srcStride];<BR>-<BR>- Vec8s v_bottomLeft(bottomLeft);<BR>- Vec8s v_topRight(topRight);<BR>-<BR>- Vec8s v_bottomRow = v_bottomLeft - v_topRow;<BR>- Vec8s v_rightColumn = v_topRight - v_leftColumn;<BR>-<BR>- v_topRow = v_topRow << const_int(2);<BR>- v_leftColumn = v_leftColumn << const_int(2);<BR>-<BR>- Vec8s v_horPred4 = v_leftColumn + Vec8s(4);<BR>- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);<BR>- Vec8s v_horPred, v_rightColumnN;<BR>- Vec8s v_im4;<BR>- Vec16uc v_im5;<BR>+ bottomLeft = left[4];<BR>+ topRight = above[4];<BR>+<BR>+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>+ &n
bsp; __m128i v_topRight = _mm_set1_epi16(topRight);<BR>+<BR>+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>+ __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);<BR>+<BR>+ v_topRow = _mm_slli_epi16(v_topRow, 2);<BR>+ v_leftColumn = _mm_slli_epi16(v_leftColumn, 2);<BR>+<BR>+ __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(4));<BR>+ const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);<BR>+ __m128i v_horPred, v_rightColumnN;<BR>+ __m128i v_im4;<BR>+ __m128i v_im5;<BR> <BR> #define COMP_PRED_PLANAR4_ROW(X) { \<BR>- v_horPred = broadcast(const_int((X)), v_horPred4); \<BR>- v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \<BR>- &nb
sp; v_horPred = v_horPred + v_rightColumnN; \<BR>- v_topRow = v_topRow + v_bottomRow; \<BR>- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \<BR>- v_im5 = compress_unsafe(v_im4, v_im4); \<BR>- store_partial(const_int(4), &dst[(X)*dstStride], v_im5); \<BR>+ BROADCAST16(v_horPred4, (X), v_horPred); \<BR>+ BROADCAST16(v_rightColumn, (X), v_rightColumnN); \<BR>+ v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi); \<BR>+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR>+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>+ v_im4 = _
mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \<BR>+ v_im5 = _mm_packus_epi16(v_im4, v_im4); \<BR>+ *(int*)&dst[(X)*dstStride] = _mm_cvtsi128_si32(v_im5); \<BR> }<BR> <BR> COMP_PRED_PLANAR4_ROW(0)<BR>@@ -875,19 +884,19 @@<BR> }<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar4_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR> pixel bottomLeft, topRight;<BR> <BR> // Get left and above reference column and row<BR>- __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&src[-srcStride]); // topRow<BR>- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>+ __m128i im0 = _mm_cvtsi32_si128(*(int*)abov
e); // topRow<BR>+ __m128i v_topRow = _mm_cvtepu8_epi16(im0);<BR> <BR> v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[4 * srcStride - 1];<BR>- topRight = src[4 - srcStride];<BR>+ bottomLeft = left[4];<BR>+ topRight = above[4];<BR> <BR> __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>@@ -904,14 +913,14 @@<BR> v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);<BR> <BR> #define COMP_PRED_PLANAR_2ROW(Y) { \<BR>- _tmp0 = _mm_cvtsi32_si128((src[((Y)) * srcStride - 1] << 2) + 4); \<BR>+
_tmp0 = _mm_cvtsi32_si128((left[(Y)] << 2) + 4); \<BR> _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \<BR>- _tmp1 = _mm_cvtsi32_si128((src[((Y)+1) * srcStride - 1] << 2) + 4); \<BR>+ _tmp1 = _mm_cvtsi32_si128((left[((Y)+1)] << 2) + 4); \<BR> _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \<BR> v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \<BR>- _tmp0 = _mm_cvtsi32_si128(topRight - src[((Y)) * srcStride - 1]); \<BR>+ _tmp0 = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR> _tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \<BR>- _tmp1 = _mm_cvtsi32_si128(topRight - src[((Y)+1) * srcStride - 1]); \
<BR>+ _tmp1 = _mm_cvtsi32_si128(topRight - left[((Y)+1)]); \<BR> _tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \<BR> v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \<BR> v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \<BR>@@ -938,48 +947,38 @@<BR> #define COMP_PRED_PLANAR_ROW(X) { \<BR> v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \<BR> v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \<BR>- v_horPred = v_horPred + v_rightColumnN; \<BR>- v_topRow = v_topRow + v_bottomRow; \<BR>- v_im4 = (Vec8s)(v_horPred + v_to
pRow) >> (3 + shift); \<BR>- store_partial(const_int(16), &dst[X * dstStride], v_im4); \<BR>-}<BR>-<BR>-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR>+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>+ v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), (3 + 1)); \<BR>+ _mm_storeu_si128((__m128i*)&dst[X * dstStride], v_im4); \<BR>+}<BR>+<BR>+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k, bottomLeft, topRight;<BR>-<BR>- int16_t leftColumn[8];<BR>+ int bottomLeft, topRight;<BR> <BR> // Get left and above refe
rence column and row<BR>- Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &src[-srcStride]); // topRow<BR>-<BR>- for (k = 0; k < 8; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>-<BR>- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn); // leftColumn<BR>+ __m128i v_topRow = _mm_loadu_si128((__m128i*)above); // topRow<BR>+ __m128i v_leftColumn = _mm_loadu_si128((__m128i*)left); // leftColumn<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[8 * srcStride - 1];<BR>- topRight = src[8 - srcStride];<BR>-<BR>- Vec8s v_bottomLeft(bottomLeft);<BR>- Vec8s v_topRight(topRight);<BR>-<BR>- Vec8s v_bottomR
ow = v_bottomLeft - v_topRow;<BR>- Vec8s v_rightColumn = v_topRight - v_leftColumn;<BR>-<BR>- int shift = g_convertToBit[8]; // Using value corresponding to width = 8<BR>- v_topRow = v_topRow << (2 + shift);<BR>- v_leftColumn = v_leftColumn << (2 + shift);<BR>+ bottomLeft = left[8];<BR>+ topRight = above[8];<BR>+<BR>+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR>+ __m128i v_topRight = _mm_set1_epi16(topRight);<BR>+<BR>+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);<BR>+ __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);<BR>+<BR>+ v_topRow = _mm_slli_epi16(v_topRow, (2 + 1));<BR>+ v_leftColumn = _mm_slli_epi16(v_leftColumn, (2 + 1));<BR> <BR>  
; // Generate prediction signal<BR>- Vec8s v_horPred4 = v_leftColumn + Vec8s(8);<BR>- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);<BR>- Vec8s v_horPred, v_rightColumnN;<BR>- Vec8s v_im4;<BR>- Vec16uc v_im5;<BR>+ __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));<BR>+ const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);<BR>+ __m128i v_horPred, v_rightColumnN;<BR>+ __m128i v_im4;<BR> <BR> COMP_PRED_PLANAR_ROW(0); // row 0<BR> COMP_PRED_PLANAR_ROW(1);<BR>@@ -1004,27 +1003,20 @@<BR> store_partial(const_int(8), &dst[X * dstStride], v_im5); \<BR> }<BR> <BR>-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void
intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k;<BR> pixel bottomLeft, topRight;<BR>- int16_t leftColumn[8];<BR> <BR> // Get left and above reference column and row<BR>- Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &src[-srcStride]); // topRow<BR>+ Vec16uc im0 = (Vec16uc)load_partial(const_int(8), (void*)above); // topRow<BR> Vec8s v_topRow = extend_low(im0);<BR> <BR>- for (k = 0; k < 8; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>-<BR>- Vec8s v_leftColumn;<BR>- v_leftColumn.load(leftColumn); // leftColumn<BR>+ Vec8s v_leftColumn = _mm_loadl_epi64((__m128i*)left); // leftColu
mn<BR>+ v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[8 * srcStride - 1];<BR>- topRight = src[8 - srcStride];<BR>+ bottomLeft = left[8];<BR>+ topRight = above[8];<BR> <BR> Vec8s v_bottomLeft(bottomLeft);<BR> Vec8s v_topRight(topRight);<BR>@@ -1055,29 +1047,18 @@<BR> #undef COMP_PRED_PLANAR_ROW<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar8_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR> pixel bottomLeft, topRight;<BR> <BR> // Get left and above reference column and row<BR>-
__m128i im0 = _mm_loadl_epi64((__m128i*)&src[0 - srcStride]); // topRow<BR>- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR>-<BR>- __m128i v_leftColumn = _mm_setzero_si128();<BR>-<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[0 * srcStride - 1], 0);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[1 * srcStride - 1], 1);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[2 * srcStride - 1], 2);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[3 * srcStride - 1], 3);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[4 * srcStride - 1], 4);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[5 * srcStride - 1], 5);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[6 * srcStride - 1], 6);<BR>- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[
7 * srcStride - 1], 7);<BR>- v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());<BR>+ __m128i v_topRow = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)above)); // topRow<BR>+<BR>+ __m128i v_leftColumn = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)left));<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[8 * srcStride - 1];<BR>- topRight = src[8 - srcStride];<BR>+ bottomLeft = left[8];<BR>+ topRight = above[8];<BR> <BR> __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> __m128i v_topRight = _mm_set1_epi16(topRight);<BR>@@ -1094,18 +1075,8 @@<BR> __m128i v_im5;<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>- if ((
Y) < 4) { \<BR>- v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \<BR>- v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \<BR>- v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \<BR>- v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \<BR>- } \<BR>- else { \<BR>- v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \<BR>- v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \<BR>-  
; v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \<BR>- v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \<BR>- } \<BR>+ BROADCAST16(v_horPred4, (Y), v_horPred); \<BR>+ BROADCAST16(v_rightColumn, (Y), v_rightColumnN); \<BR> v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \<BR> v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \<BR> v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \<BR>@@ -1148,29 +1119,22 @@<BR> v_im4_hi.store(&dst[X * dstStride + 8]); \<BR> }<BR> <BR>-void intra_pred_planar16(pixel* src, intptr
_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k;<BR> pixel bottomLeft, topRight;<BR>- int16_t leftColumn[16];<BR> <BR> // Get left and above reference column and row<BR> Vec8s v_topRow_lo, v_topRow_hi;<BR> <BR>- v_topRow_lo.load(&src[-srcStride]);<BR>- v_topRow_hi.load(&src[-srcStride + 8]);<BR>-<BR>- for (k = 0; k < 16; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>+ v_topRow_lo.load(&above[0]);<BR>+ v_topRow_hi.load(&above[8]);<BR> <BR> Vec8s v_leftColumn;<BR>- v_leftColumn.load(leftColumn); // le
ftColumn<BR>+ v_leftColumn.load(left); // leftColumn<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[16 * srcStride - 1];<BR>- topRight = src[16 - srcStride];<BR>+ bottomLeft = left[16];<BR>+ topRight = above[16];<BR> <BR> Vec8s v_bottomLeft(bottomLeft);<BR> Vec8s v_topRight(topRight);<BR>@@ -1200,7 +1164,7 @@<BR> COMP_PRED_PLANAR_ROW(6);<BR> COMP_PRED_PLANAR_ROW(7); // row 7<BR> <BR>- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows<BR>+ v_leftColumn.load(left + 8); // leftColumn lower 8 rows<BR> v_rightColumn = v_topRight - v_leftColumn;<BR> v_
leftColumn = v_leftColumn << (2 + shift);<BR> v_horPred4 = v_leftColumn + Vec8s(16);<BR>@@ -1235,28 +1199,20 @@<BR> store_partial(const_int(16), &dst[X * dstStride], v_im5); \<BR> }<BR> <BR>-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR>- int k;<BR> pixel bottomLeft, topRight;<BR>- int16_t leftColumn[16];<BR> <BR> // Get left and above reference column and row<BR>- Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &src[-srcStride]); // topRow<BR>+ Vec16uc im0 = (Vec16uc)load_partial(const_int(16), above); // topRow<BR> Vec8s v_topRow_lo = extend_low(im0);<BR> Vec8s v_topRow_h
i = extend_high(im0);<BR> <BR>- for (k = 0; k < 16; k++)<BR>- {<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>- }<BR>-<BR>- Vec8s v_leftColumn;<BR>- v_leftColumn.load(leftColumn); // leftColumn<BR>+ Vec8s v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)left), _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[16 * srcStride - 1];<BR>- topRight = src[16 - srcStride];<BR>+ bottomLeft = left[16];<BR>+ topRight = above[16];<BR> <BR> Vec8s v_bottomLeft(bottomLeft);<BR> Vec8s v_topRight(topRight);<BR>@@ -1286,7 +1242,8 @@<BR> COMP_PRED_PLANAR_ROW(6);<BR>
COMP_PRED_PLANAR_ROW(7); // row 7<BR> <BR>- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows<BR>+ // leftColumn lower 8 rows<BR>+ v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(left + 8)), _mm_setzero_si128());<BR> v_rightColumn = v_topRight - v_leftColumn;<BR> v_leftColumn = v_leftColumn << (2 + shift);<BR> v_horPred4 = v_leftColumn + Vec8s(16);<BR>@@ -1304,21 +1261,21 @@<BR> #undef COMP_PRED_PLANAR_ROW<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar16_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR> pixel bottomLeft, topRight;<BR> __m128i v_topRow[2];<BR> &nbs
p; __m128i v_bottomRow[2];<BR> <BR> // Get left and above reference column and row<BR>- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>+ __m128i im0 = _mm_loadu_si128((__m128i*)above); // topRow<BR> <BR> v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR> v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[16 * srcStride - 1];<BR>- topRight = src[16 - srcStride];<BR>+ bottomLeft = left[16];<BR>+ topRight = above[16];<BR> <BR> __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1333,10 +1290,10 @@<BR> __m128i v_im5;<BR> <BR>&n
bsp;#define COMP_PRED_PLANAR_ROW(Y) { \<BR>- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 4) + 16); \<BR>+ v_horPred = _mm_cvtsi32_si128((left[(Y)] << 4) + 16); \<BR> v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR> v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR> _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR> _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR> v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1374,15 +
1331,15 @@<BR> #endif // INSTRSET >= 5<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar32_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)<BR> {<BR> pixel bottomLeft, topRight;<BR> __m128i v_topRow[4];<BR> __m128i v_bottomRow[4];<BR> <BR> // Get left and above reference column and row<BR>- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>- __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow<BR>+ __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow<BR>+ __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow<BR> <BR> v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_s
i128());<BR> v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR>@@ -1390,8 +1347,8 @@<BR> v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[32 * srcStride - 1];<BR>- topRight = src[32 - srcStride];<BR>+ bottomLeft = left[32];<BR>+ topRight = above[32];<BR> <BR> __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1410,10 +1367,10 @@<BR> __m128i v_im5[2];<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 5) + 32); \<BR>+ v_horPred = _mm_cvtsi32_si128((left[(Y)] << 5) + 32); \<BR>&nb
sp; v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR> v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR> _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR> _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR> v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1451,17 +1408,17 @@<BR> #endif // INSTRSET >= 5<BR> <BR> #if INSTRSET >= 5<BR>-void intra_pred_planar64_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)<BR>+void intra_pred_planar64_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
<BR> {<BR> pixel bottomLeft, topRight;<BR> __m128i v_topRow[8];<BR> __m128i v_bottomRow[8];<BR> <BR> // Get left and above reference column and row<BR>- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow<BR>- __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow<BR>- __m128i im2 = _mm_loadu_si128((__m128i*)&src[32 - srcStride]); // topRow<BR>- __m128i im3 = _mm_loadu_si128((__m128i*)&src[48 - srcStride]); // topRow<BR>+ __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow<BR>+ __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow<BR>+ __m128i im2 = _mm_loadu_si128((__m128i*)&above[32]); // topRow<BR>+ __m128i im3 = _mm_loadu_si128((__m128i*)&above[4
8]); // topRow<BR> <BR> v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());<BR> v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());<BR>@@ -1473,8 +1430,8 @@<BR> v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = src[64 * srcStride - 1];<BR>- topRight = src[64 - srcStride];<BR>+ bottomLeft = left[64];<BR>+ topRight = above[64];<BR> <BR> __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);<BR> <BR>@@ -1501,10 +1458,10 @@<BR> __m128i v_im5[4];<BR> <BR> #define COMP_PRED_PLANAR_ROW(Y) { \<BR>- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 6) + 64); \<BR>+ &n
bsp; v_horPred = _mm_cvtsi32_si128((left[(Y)] << 6) + 64); \<BR> v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \<BR> v_horPred = _mm_shuffle_epi32(v_horPred, 0); \<BR>- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \<BR>+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \<BR> _tmp = _mm_shufflelo_epi16(_tmp, 0); \<BR> _tmp = _mm_shuffle_epi32(_tmp, 0); \<BR> v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \<BR>@@ -1563,7 +1520,7 @@<BR> <BR> #endif /* if HIGH_BIT_DEPTH */<BR> <BR>-typedef void intra_pred_planar_t (pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride);<BR
>+typedef void intra_pred_planar_t (pixel* above, pixel* left, pixel* dst, intptr_t dstStride);<BR> intra_pred_planar_t *intraPlanarN[] =<BR> {<BR> #if !HIGH_BIT_DEPTH && INSTRSET >= 5<BR>@@ -1579,18 +1536,18 @@<BR> #endif<BR> };<BR> <BR>-void intra_pred_planar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)<BR>+void intra_pred_planar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)<BR> {<BR> int nLog2Size = g_convertToBit[width] + 2;<BR> <BR> #if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)<BR>- intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);<BR>+ intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);<BR> return;<BR> #else<BR> int k, l, bottomLeft, topRight;<BR> int horPred;<BR> // OPT_ME: when width is
64, the shift1D is 8, then the dynamic range is [-65280, 65280], so we have to use 32 bits here<BR>- int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];<BR>+ int32_t leftColumn[MAX_CU_SIZE], topRow[MAX_CU_SIZE];<BR> // CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)<BR> int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];<BR> int blkSize = width;<BR>@@ -1600,20 +1557,20 @@<BR> <BR> if (width < 32)<BR> {<BR>- intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);<BR>+ intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);<BR> return;<BR> }<BR> <BR> // Get left and above reference
column and row<BR>- for (k = 0; k < blkSize + 1; k++)<BR>+ for (k = 0; k < blkSize; k++)<BR> {<BR>- topRow[k] = src[k - srcStride];<BR>- leftColumn[k] = src[k * srcStride - 1];<BR>+ topRow[k] = above[k];<BR>+ leftColumn[k] = left[k];<BR> }<BR> <BR> // Prepare intermediate variables used in interpolation<BR>- bottomLeft = leftColumn[blkSize];<BR>- topRight = topRow[blkSize];<BR>+ bottomLeft = left[blkSize];<BR>+ topRight = above[blkSize];<BR> for (k = 0; k < blkSize; k++)<BR> {<BR> bottomRow[k] = b
ottomLeft - topRow[k];<BR>diff -r f813f110d69a -r a4013cdafef0 source/encoder/compress.cpp<BR>--- a/source/encoder/compress.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/encoder/compress.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -123,23 +123,25 @@<BR> CandNum = 0;<BR> UInt modeCosts[35];<BR> Bool bFilter = (width <= 16);<BR>- Pel *src = m_search->getPredicBuf();<BR> <BR> Pel *pAbove0 = m_search->refAbove + width - 1;<BR> Pel *pAbove1 = m_search->refAboveFlt + width - 1;<BR> Pel *pLeft0 = m_search->refLeft + width - 1;<BR> Pel *pLef
t1 = m_search->refLeftFlt + width - 1;<BR>+ Pel *pAbove = pAbove0;<BR>+ Pel *pLeft = pLeft0;<BR> <BR> // 1<BR>- primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);<BR>+ primitives.intra_pred_dc((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width, bFilter);<BR> modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR> // 0<BR> if (width >= 8 && width <= 32)<BR> {<BR>- src += ADI_BUF_STRIDE * (2 * width + 1);<BR>+ &n
bsp; pAbove = pAbove1;<BR>+ pLeft = pLeft1;<BR> }<BR>- primitives.intra_pred_planar(src + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);<BR>+ primitives.intra_pred_planar(pAbove + 1, pLeft + 1, pred, stride, width);<BR> modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);<BR> <BR> // 33 Angle modes once<BR>diff -r f813f110d69a -r a4013cdafef0 source/test/intrapredharness.cpp<BR>--- a/source/test/intrapredharness.cpp Thu Jul 18 02:10:37 2013 -0500<BR>+++ b/source/test/intrapredharness.cpp Thu Jul 18 15:52:51 2013 +0800<BR>@@ -117,17 +117,26 @@<BR> {<BR>
for (int i = 0; i <= 100; i++)<BR> {<BR>+ pixel left[MAX_CU_SIZE * 2 + 1];<BR>+ for (int k = 0; k < width * 2 + 1; k++)<BR>+ {<BR>+ left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];<BR>+ }<BR> #if _DEBUG<BR> memset(pixel_out_vec, 0xCD, out_size);<BR> memset(pixel_out_c, 0xCD, out_size);<BR> #endif<BR>- ref(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_
c, FENC_STRIDE, width);<BR>- opt(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_vec, FENC_STRIDE, width);<BR>+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, width);<BR>+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);<BR> <BR> for (int k = 0; k < width; k++)<BR> {<BR> if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))<BR> {<BR>+#if _DEBUG<B
R>+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, width);<BR>+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);<BR>+#endif<BR> return false;<BR> }<BR> }<BR>@@ -292,7 +301,7 @@<BR> width = ii;<BR> printf("intra_planar%2dx%d", ii, ii);<BR> &nb
sp; REPORT_SPEEDUP(opt.intra_pred_planar, ref.intra_pred_planar,<BR>- pixel_buff + srcStride, srcStride, pixel_out_vec, FENC_STRIDE, width);<BR>+ pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width);<BR> }<BR> }<BR> if (opt.intra_pred_ang)<BR></P>
<DIV></DIV></div>