[x265] [PATCH]Improvement intra_pred_planar
chen
chenm003 at 163.com
Thu Jul 18 10:21:25 CEST 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1374133971 -28800
# Node ID a4013cdafef00502efe0d496dcb3c4f2bce966a4
# Parent f813f110d69a1a6650e813dd4e612216982a0264
intrapred: improvement intra_pred_planar
diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/Lib/TLibCommon/TComPrediction.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -177,17 +177,16 @@
}
// get starting pixel in block
- Int sw = ADI_BUF_STRIDE;
Bool bFilter = (size <= 16);
// Create the prediction
if (dirMode == PLANAR_IDX)
{
- primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, size);
+ primitives.intra_pred_planar((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size);
}
else if (dirMode == DC_IDX)
{
- primitives.intra_pred_dc(refAbv + 1, refLft + 1, dst, stride, size, bFilter);
+ primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
}
else
{
@@ -198,33 +197,28 @@
// Angular chroma
Void TComPrediction::predIntraChromaAng(Pel* src, UInt dirMode, Pel* dst, UInt stride, Int width)
{
+ // Create the prediction
+ Pel refAbv[3 * MAX_CU_SIZE];
+ Pel refLft[3 * MAX_CU_SIZE];
+ int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1 + 1) : (2 * width + 1);
+ memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));
+ for (int k = 0; k < limit; k++)
+ {
+ refLft[k + width - 1] = src[k * ADI_BUF_STRIDE];
+ }
+
// get starting pixel in block
- Int sw = ADI_BUF_STRIDE;
-
if (dirMode == PLANAR_IDX)
{
- primitives.intra_pred_planar(src + sw + 1, sw, dst, stride, width);
+ primitives.intra_pred_planar((pixel*)refAbv + width - 1 + 1, (pixel*)refLft + width - 1 + 1, (pixel*)dst, stride, width);
+ }
+ else if (dirMode == DC_IDX)
+ {
+ primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
}
else
{
- // Create the prediction
- Pel refAbv[3 * MAX_CU_SIZE];
- Pel refLft[3 * MAX_CU_SIZE];
- int limit = (dirMode <= 25 && dirMode >= 11) ? (width + 1) : (2 * width + 1);
- memcpy(refAbv + width - 1, src, (limit) * sizeof(Pel));
- for (int k = 0; k < limit; k++)
- {
- refLft[k + width - 1] = src[k * sw];
- }
-
- if (dirMode == DC_IDX)
- {
- primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
- }
- else
- {
- primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);
- }
+ primitives.intra_pred_ang(dst, stride, width, dirMode, false, refLft + width - 1, refAbv + width - 1);
}
}
diff -r f813f110d69a -r a4013cdafef0 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -1975,6 +1975,8 @@
Pel *pAbove1 = refAboveFlt + width - 1;
Pel *pLeft0 = refLeft + width - 1;
Pel *pLeft1 = refLeftFlt + width - 1;
+ Pel *above = pAbove0;
+ Pel *left = pLeft0;
// 1
primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);
@@ -1984,8 +1986,10 @@
if (width >= 8 && width <= 32)
{
predSrc += ADI_BUF_STRIDE * (2 * width + 1);
+ above = pAbove1;
+ left = pLeft1;
}
- primitives.intra_pred_planar(predSrc + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);
+ primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, pred, stride, width);
modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// 33 Angle modes once
diff -r f813f110d69a -r a4013cdafef0 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/intrapred.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -98,7 +98,7 @@
}
}
-void PredIntraPlanar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)
+void PredIntraPlanar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
{
//assert(width == height);
@@ -117,8 +117,8 @@
// Get left and above reference column and row
for (k = 0; k < blkSize + 1; k++)
{
- topRow[k] = src[k - srcStride];
- leftColumn[k] = src[k * srcStride - 1];
+ topRow[k] = above[k];
+ leftColumn[k] = left[k];
}
// Prepare intermediate variables used in interpolation
diff -r f813f110d69a -r a4013cdafef0 source/common/primitives.h
--- a/source/common/primitives.h Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/primitives.h Thu Jul 18 15:52:51 2013 +0800
@@ -195,7 +195,7 @@
typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
-typedef void (*intra_planar_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width);
+typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
diff -r f813f110d69a -r a4013cdafef0 source/common/vec/intrapred.inc
--- a/source/common/vec/intrapred.inc Thu Jul 18 02:10:37 2013 -0500
+++ b/source/common/vec/intrapred.inc Thu Jul 18 15:52:51 2013 +0800
@@ -746,27 +746,42 @@
#endif // if HIGH_BIT_DEPTH
}
+#if INSTRSET >= 4 // SSSE3
+ #define BROADCAST16(a, d, x) { \
+ const __m128i mask = _mm_set1_epi16( (((d) * 2) | ((d) * 2 + 1) << 8) ); \
+ (x) = _mm_shuffle_epi8((a), mask); \
+ }
+#else
+ #define BROADCAST16(a, d, x) { \
+ const int dL = (d) & 3; \
+ const int dH = ((d)-4) & 3; \
+ if (d>=4) { \
+ (x) = _mm_shufflehi_epi16((a), dH * 0x55); \
+ (x) = _mm_unpackhi_epi64((x), (x)); \
+ } \
+ else { \
+ (x) = _mm_shufflelo_epi16((a), dL * 0x55); \
+ (x) = _mm_unpacklo_epi64((x), (x)); \
+ } \
+ }
+#endif
+
+
#if HIGH_BIT_DEPTH
-// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input as 12bpp
-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+// CHECK_ME: I am not sure the v_rightColumnN will be overflow when input is 12bpp
+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k, bottomLeft, topRight;
+ int bottomLeft, topRight;
// NOTE: I use 16-bits is enough here, because we have least than 13-bits as input, and shift left by 2, it is 15-bits
- int16_t leftColumn[4];
// Get left and above reference column and row
- Vec8s v_topRow = (Vec8s)load_partial(const_int(8), &src[-srcStride]); // topRow
-
- for (k = 0; k < 4; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), leftColumn); // leftColumn
+ Vec8s v_topRow = (Vec8s)load_partial(const_int(8), above); // topRow
+
+ Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), left); // leftColumn
// Prepare intermediate variables used in interpolation
- bottomLeft = src[4 * srcStride - 1];
- topRight = src[4 - srcStride];
+ bottomLeft = left[4];
+ topRight = above[4];
Vec8s v_bottomLeft(bottomLeft);
Vec8s v_topRight(topRight);
@@ -819,51 +834,45 @@
}
#else /* if HIGH_BIT_DEPTH */
-void intra_pred_planar4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k;
pixel bottomLeft, topRight;
// Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(4), &src[-srcStride]); // topRow
- Vec8s v_topRow = extend_low(im0);
-
- int16_t leftColumn[4];
-
- for (k = 0; k < 4; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(8), (void*)leftColumn); // leftColumn
+ __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow
+ __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+
+ __m128i v_leftColumn = _mm_cvtsi32_si128(*(int*)left); // leftColumn
+ v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[4 * srcStride - 1];
- topRight = src[4 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- v_topRow = v_topRow << const_int(2);
- v_leftColumn = v_leftColumn << const_int(2);
-
- Vec8s v_horPred4 = v_leftColumn + Vec8s(4);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
+ bottomLeft = left[4];
+ topRight = above[4];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+ __m128i v_topRight = _mm_set1_epi16(topRight);
+
+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+ __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+ v_topRow = _mm_slli_epi16(v_topRow, 2);
+ v_leftColumn = _mm_slli_epi16(v_leftColumn, 2);
+
+ __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(4));
+ const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ __m128i v_horPred, v_rightColumnN;
+ __m128i v_im4;
+ __m128i v_im5;
#define COMP_PRED_PLANAR4_ROW(X) { \
- v_horPred = broadcast(const_int((X)), v_horPred4); \
- v_rightColumnN = broadcast(const_int((X)), v_rightColumn) * v_multi; \
- v_horPred = v_horPred + v_rightColumnN; \
- v_topRow = v_topRow + v_bottomRow; \
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> const_int(3); \
- v_im5 = compress_unsafe(v_im4, v_im4); \
- store_partial(const_int(4), &dst[(X)*dstStride], v_im5); \
+ BROADCAST16(v_horPred4, (X), v_horPred); \
+ BROADCAST16(v_rightColumn, (X), v_rightColumnN); \
+ v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi); \
+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+ v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), 3); \
+ v_im5 = _mm_packus_epi16(v_im4, v_im4); \
+ *(int*)&dst[(X)*dstStride] = _mm_cvtsi128_si32(v_im5); \
}
COMP_PRED_PLANAR4_ROW(0)
@@ -875,19 +884,19 @@
}
#if INSTRSET >= 5
-void intra_pred_planar4_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
pixel bottomLeft, topRight;
// Get left and above reference column and row
- __m128i im0 = _mm_cvtsi32_si128(*(uint32_t*)&src[-srcStride]); // topRow
- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
+ __m128i im0 = _mm_cvtsi32_si128(*(int*)above); // topRow
+ __m128i v_topRow = _mm_cvtepu8_epi16(im0);
v_topRow = _mm_shuffle_epi32(v_topRow, 0x44);
// Prepare intermediate variables used in interpolation
- bottomLeft = src[4 * srcStride - 1];
- topRight = src[4 - srcStride];
+ bottomLeft = left[4];
+ topRight = above[4];
__m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
__m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
@@ -904,14 +913,14 @@
v_bottomRow = _mm_slli_epi16(v_bottomRow, 1);
#define COMP_PRED_PLANAR_2ROW(Y) { \
- _tmp0 = _mm_cvtsi32_si128((src[((Y)) * srcStride - 1] << 2) + 4); \
+ _tmp0 = _mm_cvtsi32_si128((left[(Y)] << 2) + 4); \
_tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
- _tmp1 = _mm_cvtsi32_si128((src[((Y)+1) * srcStride - 1] << 2) + 4); \
+ _tmp1 = _mm_cvtsi32_si128((left[((Y)+1)] << 2) + 4); \
_tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
v_horPred = _mm_unpacklo_epi64(_tmp0, _tmp1); \
- _tmp0 = _mm_cvtsi32_si128(topRight - src[((Y)) * srcStride - 1]); \
+ _tmp0 = _mm_cvtsi32_si128(topRight - left[(Y)]); \
_tmp0 = _mm_shufflelo_epi16(_tmp0, 0); \
- _tmp1 = _mm_cvtsi32_si128(topRight - src[((Y)+1) * srcStride - 1]); \
+ _tmp1 = _mm_cvtsi32_si128(topRight - left[((Y)+1)]); \
_tmp1 = _mm_shufflelo_epi16(_tmp1, 0); \
v_rightColumnN = _mm_unpacklo_epi64(_tmp0, _tmp1); \
v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multi_2Row); \
@@ -938,48 +947,38 @@
#define COMP_PRED_PLANAR_ROW(X) { \
v_horPred = permute8s<X, X, X, X, X, X, X, X>(v_horPred4); \
v_rightColumnN = permute8s<X, X, X, X, X, X, X, X>(v_rightColumn) * v_multi; \
- v_horPred = v_horPred + v_rightColumnN; \
- v_topRow = v_topRow + v_bottomRow; \
- v_im4 = (Vec8s)(v_horPred + v_topRow) >> (3 + shift); \
- store_partial(const_int(16), &dst[X * dstStride], v_im4); \
-}
-
-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+ v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
+ v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
+ v_im4 = _mm_srai_epi16(_mm_add_epi16(v_horPred, v_topRow), (3 + 1)); \
+ _mm_storeu_si128((__m128i*)&dst[X * dstStride], v_im4); \
+}
+
+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k, bottomLeft, topRight;
-
- int16_t leftColumn[8];
+ int bottomLeft, topRight;
// Get left and above reference column and row
- Vec8s v_topRow = (Vec8s)load_partial(const_int(16), &src[-srcStride]); // topRow
-
- for (k = 0; k < 8; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn = (Vec8s)load_partial(const_int(16), leftColumn); // leftColumn
+ __m128i v_topRow = _mm_loadu_si128((__m128i*)above); // topRow
+ __m128i v_leftColumn = _mm_loadu_si128((__m128i*)left); // leftColumn
// Prepare intermediate variables used in interpolation
- bottomLeft = src[8 * srcStride - 1];
- topRight = src[8 - srcStride];
-
- Vec8s v_bottomLeft(bottomLeft);
- Vec8s v_topRight(topRight);
-
- Vec8s v_bottomRow = v_bottomLeft - v_topRow;
- Vec8s v_rightColumn = v_topRight - v_leftColumn;
-
- int shift = g_convertToBit[8]; // Using value corresponding to width = 8
- v_topRow = v_topRow << (2 + shift);
- v_leftColumn = v_leftColumn << (2 + shift);
+ bottomLeft = left[8];
+ topRight = above[8];
+
+ __m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
+ __m128i v_topRight = _mm_set1_epi16(topRight);
+
+ __m128i v_bottomRow = _mm_sub_epi16(v_bottomLeft, v_topRow);
+ __m128i v_rightColumn = _mm_sub_epi16(v_topRight, v_leftColumn);
+
+ v_topRow = _mm_slli_epi16(v_topRow, (2 + 1));
+ v_leftColumn = _mm_slli_epi16(v_leftColumn, (2 + 1));
// Generate prediction signal
- Vec8s v_horPred4 = v_leftColumn + Vec8s(8);
- const Vec8s v_multi(1, 2, 3, 4, 5, 6, 7, 8);
- Vec8s v_horPred, v_rightColumnN;
- Vec8s v_im4;
- Vec16uc v_im5;
+ __m128i v_horPred4 = _mm_add_epi16(v_leftColumn, _mm_set1_epi16(8));
+ const __m128i v_multi = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ __m128i v_horPred, v_rightColumnN;
+ __m128i v_im4;
COMP_PRED_PLANAR_ROW(0); // row 0
COMP_PRED_PLANAR_ROW(1);
@@ -1004,27 +1003,20 @@
store_partial(const_int(8), &dst[X * dstStride], v_im5); \
}
-void intra_pred_planar8(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar8(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k;
pixel bottomLeft, topRight;
- int16_t leftColumn[8];
// Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(8), &src[-srcStride]); // topRow
+ Vec16uc im0 = (Vec16uc)load_partial(const_int(8), (void*)above); // topRow
Vec8s v_topRow = extend_low(im0);
- for (k = 0; k < 8; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
+ Vec8s v_leftColumn = _mm_loadl_epi64((__m128i*)left); // leftColumn
+ v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[8 * srcStride - 1];
- topRight = src[8 - srcStride];
+ bottomLeft = left[8];
+ topRight = above[8];
Vec8s v_bottomLeft(bottomLeft);
Vec8s v_topRight(topRight);
@@ -1055,29 +1047,18 @@
#undef COMP_PRED_PLANAR_ROW
#if INSTRSET >= 5
-void intra_pred_planar8_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar8_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
pixel bottomLeft, topRight;
// Get left and above reference column and row
- __m128i im0 = _mm_loadl_epi64((__m128i*)&src[0 - srcStride]); // topRow
- __m128i v_topRow = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
-
- __m128i v_leftColumn = _mm_setzero_si128();
-
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[0 * srcStride - 1], 0);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[1 * srcStride - 1], 1);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[2 * srcStride - 1], 2);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[3 * srcStride - 1], 3);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[4 * srcStride - 1], 4);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[5 * srcStride - 1], 5);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[6 * srcStride - 1], 6);
- v_leftColumn = _mm_insert_epi8(v_leftColumn, src[7 * srcStride - 1], 7);
- v_leftColumn = _mm_unpacklo_epi8(v_leftColumn, _mm_setzero_si128());
+ __m128i v_topRow = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)above)); // topRow
+
+ __m128i v_leftColumn = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)left));
// Prepare intermediate variables used in interpolation
- bottomLeft = src[8 * srcStride - 1];
- topRight = src[8 - srcStride];
+ bottomLeft = left[8];
+ topRight = above[8];
__m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
__m128i v_topRight = _mm_set1_epi16(topRight);
@@ -1094,18 +1075,8 @@
__m128i v_im5;
#define COMP_PRED_PLANAR_ROW(Y) { \
- if ((Y) < 4) { \
- v_horPred = _mm_shufflelo_epi16(v_horPred4, ((Y) & 3) * 0x55); \
- v_horPred = _mm_unpacklo_epi64(v_horPred, v_horPred); \
- v_rightColumnN = _mm_shufflelo_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
- v_rightColumnN = _mm_unpacklo_epi64(v_rightColumnN, v_rightColumnN); \
- } \
- else { \
- v_horPred = _mm_shufflehi_epi16(v_horPred4, ((Y) & 3) * 0x55); \
- v_horPred = _mm_unpackhi_epi64(v_horPred, v_horPred); \
- v_rightColumnN = _mm_shufflehi_epi16(v_rightColumn, ((Y) & 3) * 0x55); \
- v_rightColumnN = _mm_unpackhi_epi64(v_rightColumnN, v_rightColumnN); \
- } \
+ BROADCAST16(v_horPred4, (Y), v_horPred); \
+ BROADCAST16(v_rightColumn, (Y), v_rightColumnN); \
v_rightColumnN = _mm_mullo_epi16(v_rightColumnN, v_multiL); \
v_horPred = _mm_add_epi16(v_horPred, v_rightColumnN); \
v_topRow = _mm_add_epi16(v_topRow, v_bottomRow); \
@@ -1148,29 +1119,22 @@
v_im4_hi.store(&dst[X * dstStride + 8]); \
}
-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k;
pixel bottomLeft, topRight;
- int16_t leftColumn[16];
// Get left and above reference column and row
Vec8s v_topRow_lo, v_topRow_hi;
- v_topRow_lo.load(&src[-srcStride]);
- v_topRow_hi.load(&src[-srcStride + 8]);
-
- for (k = 0; k < 16; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
+ v_topRow_lo.load(&above[0]);
+ v_topRow_hi.load(&above[8]);
Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
+ v_leftColumn.load(left); // leftColumn
// Prepare intermediate variables used in interpolation
- bottomLeft = src[16 * srcStride - 1];
- topRight = src[16 - srcStride];
+ bottomLeft = left[16];
+ topRight = above[16];
Vec8s v_bottomLeft(bottomLeft);
Vec8s v_topRight(topRight);
@@ -1200,7 +1164,7 @@
COMP_PRED_PLANAR_ROW(6);
COMP_PRED_PLANAR_ROW(7); // row 7
- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
+ v_leftColumn.load(left + 8); // leftColumn lower 8 rows
v_rightColumn = v_topRight - v_leftColumn;
v_leftColumn = v_leftColumn << (2 + shift);
v_horPred4 = v_leftColumn + Vec8s(16);
@@ -1235,28 +1199,20 @@
store_partial(const_int(16), &dst[X * dstStride], v_im5); \
}
-void intra_pred_planar16(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
- int k;
pixel bottomLeft, topRight;
- int16_t leftColumn[16];
// Get left and above reference column and row
- Vec16uc im0 = (Vec16uc)load_partial(const_int(16), &src[-srcStride]); // topRow
+ Vec16uc im0 = (Vec16uc)load_partial(const_int(16), above); // topRow
Vec8s v_topRow_lo = extend_low(im0);
Vec8s v_topRow_hi = extend_high(im0);
- for (k = 0; k < 16; k++)
- {
- leftColumn[k] = src[k * srcStride - 1];
- }
-
- Vec8s v_leftColumn;
- v_leftColumn.load(leftColumn); // leftColumn
+ Vec8s v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)left), _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[16 * srcStride - 1];
- topRight = src[16 - srcStride];
+ bottomLeft = left[16];
+ topRight = above[16];
Vec8s v_bottomLeft(bottomLeft);
Vec8s v_topRight(topRight);
@@ -1286,7 +1242,8 @@
COMP_PRED_PLANAR_ROW(6);
COMP_PRED_PLANAR_ROW(7); // row 7
- v_leftColumn.load(leftColumn + 8); // leftColumn lower 8 rows
+ // leftColumn lower 8 rows
+ v_leftColumn = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(left + 8)), _mm_setzero_si128());
v_rightColumn = v_topRight - v_leftColumn;
v_leftColumn = v_leftColumn << (2 + shift);
v_horPred4 = v_leftColumn + Vec8s(16);
@@ -1304,21 +1261,21 @@
#undef COMP_PRED_PLANAR_ROW
#if INSTRSET >= 5
-void intra_pred_planar16_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
pixel bottomLeft, topRight;
__m128i v_topRow[2];
__m128i v_bottomRow[2];
// Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
+ __m128i im0 = _mm_loadu_si128((__m128i*)above); // topRow
v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[16 * srcStride - 1];
- topRight = src[16 - srcStride];
+ bottomLeft = left[16];
+ topRight = above[16];
__m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
@@ -1333,10 +1290,10 @@
__m128i v_im5;
#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 4) + 16); \
+ v_horPred = _mm_cvtsi32_si128((left[(Y)] << 4) + 16); \
v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
_tmp = _mm_shufflelo_epi16(_tmp, 0); \
_tmp = _mm_shuffle_epi32(_tmp, 0); \
v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1374,15 +1331,15 @@
#endif // INSTRSET >= 5
#if INSTRSET >= 5
-void intra_pred_planar32_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
pixel bottomLeft, topRight;
__m128i v_topRow[4];
__m128i v_bottomRow[4];
// Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
- __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow
+ __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow
+ __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow
v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
@@ -1390,8 +1347,8 @@
v_topRow[3] = _mm_unpackhi_epi8(im1, _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[32 * srcStride - 1];
- topRight = src[32 - srcStride];
+ bottomLeft = left[32];
+ topRight = above[32];
__m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
@@ -1410,10 +1367,10 @@
__m128i v_im5[2];
#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 5) + 32); \
+ v_horPred = _mm_cvtsi32_si128((left[(Y)] << 5) + 32); \
v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
_tmp = _mm_shufflelo_epi16(_tmp, 0); \
_tmp = _mm_shuffle_epi32(_tmp, 0); \
v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1451,17 +1408,17 @@
#endif // INSTRSET >= 5
#if INSTRSET >= 5
-void intra_pred_planar64_sse4(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride)
+void intra_pred_planar64_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride)
{
pixel bottomLeft, topRight;
__m128i v_topRow[8];
__m128i v_bottomRow[8];
// Get left and above reference column and row
- __m128i im0 = _mm_loadu_si128((__m128i*)&src[0 - srcStride]); // topRow
- __m128i im1 = _mm_loadu_si128((__m128i*)&src[16 - srcStride]); // topRow
- __m128i im2 = _mm_loadu_si128((__m128i*)&src[32 - srcStride]); // topRow
- __m128i im3 = _mm_loadu_si128((__m128i*)&src[48 - srcStride]); // topRow
+ __m128i im0 = _mm_loadu_si128((__m128i*)&above[0]); // topRow
+ __m128i im1 = _mm_loadu_si128((__m128i*)&above[16]); // topRow
+ __m128i im2 = _mm_loadu_si128((__m128i*)&above[32]); // topRow
+ __m128i im3 = _mm_loadu_si128((__m128i*)&above[48]); // topRow
v_topRow[0] = _mm_unpacklo_epi8(im0, _mm_setzero_si128());
v_topRow[1] = _mm_unpackhi_epi8(im0, _mm_setzero_si128());
@@ -1473,8 +1430,8 @@
v_topRow[7] = _mm_unpackhi_epi8(im3, _mm_setzero_si128());
// Prepare intermediate variables used in interpolation
- bottomLeft = src[64 * srcStride - 1];
- topRight = src[64 - srcStride];
+ bottomLeft = left[64];
+ topRight = above[64];
__m128i v_bottomLeft = _mm_set1_epi16(bottomLeft);
@@ -1501,10 +1458,10 @@
__m128i v_im5[4];
#define COMP_PRED_PLANAR_ROW(Y) { \
- v_horPred = _mm_cvtsi32_si128((src[(Y)*srcStride - 1] << 6) + 64); \
+ v_horPred = _mm_cvtsi32_si128((left[(Y)] << 6) + 64); \
v_horPred = _mm_shufflelo_epi16(v_horPred, 0); \
v_horPred = _mm_shuffle_epi32(v_horPred, 0); \
- __m128i _tmp = _mm_cvtsi32_si128(topRight - src[(Y)*srcStride - 1]); \
+ __m128i _tmp = _mm_cvtsi32_si128(topRight - left[(Y)]); \
_tmp = _mm_shufflelo_epi16(_tmp, 0); \
_tmp = _mm_shuffle_epi32(_tmp, 0); \
v_rightColumnN[0] = _mm_mullo_epi16(_tmp, v_multiL); \
@@ -1563,7 +1520,7 @@
#endif /* if HIGH_BIT_DEPTH */
-typedef void intra_pred_planar_t (pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride);
+typedef void intra_pred_planar_t (pixel* above, pixel* left, pixel* dst, intptr_t dstStride);
intra_pred_planar_t *intraPlanarN[] =
{
#if !HIGH_BIT_DEPTH && INSTRSET >= 5
@@ -1579,18 +1536,18 @@
#endif
};
-void intra_pred_planar(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width)
+void intra_pred_planar(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
{
int nLog2Size = g_convertToBit[width] + 2;
#if (!HIGH_BIT_DEPTH) && (INSTRSET >= 5)
- intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);
+ intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);
return;
#else
int k, l, bottomLeft, topRight;
int horPred;
// OPT_ME: when width is 64, the shift1D is 8, then the dynamic range is [-65280, 65280], so we have to use 32 bits here
- int32_t leftColumn[MAX_CU_SIZE + 1], topRow[MAX_CU_SIZE + 1];
+ int32_t leftColumn[MAX_CU_SIZE], topRow[MAX_CU_SIZE];
// CHECK_ME: dynamic range is 9 bits or 15 bits(I assume max input bit_depth is 14 bits)
int16_t bottomRow[MAX_CU_SIZE], rightColumn[MAX_CU_SIZE];
int blkSize = width;
@@ -1600,20 +1557,20 @@
if (width < 32)
{
- intraPlanarN[nLog2Size - 2](src, srcStride, dst, dstStride);
+ intraPlanarN[nLog2Size - 2](above, left, dst, dstStride);
return;
}
// Get left and above reference column and row
- for (k = 0; k < blkSize + 1; k++)
+ for (k = 0; k < blkSize; k++)
{
- topRow[k] = src[k - srcStride];
- leftColumn[k] = src[k * srcStride - 1];
+ topRow[k] = above[k];
+ leftColumn[k] = left[k];
}
// Prepare intermediate variables used in interpolation
- bottomLeft = leftColumn[blkSize];
- topRight = topRow[blkSize];
+ bottomLeft = left[blkSize];
+ topRight = above[blkSize];
for (k = 0; k < blkSize; k++)
{
bottomRow[k] = bottomLeft - topRow[k];
diff -r f813f110d69a -r a4013cdafef0 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/encoder/compress.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -123,23 +123,25 @@
CandNum = 0;
UInt modeCosts[35];
Bool bFilter = (width <= 16);
- Pel *src = m_search->getPredicBuf();
Pel *pAbove0 = m_search->refAbove + width - 1;
Pel *pAbove1 = m_search->refAboveFlt + width - 1;
Pel *pLeft0 = m_search->refLeft + width - 1;
Pel *pLeft1 = m_search->refLeftFlt + width - 1;
+ Pel *pAbove = pAbove0;
+ Pel *pLeft = pLeft0;
// 1
- primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, pred, stride, width, bFilter);
+ primitives.intra_pred_dc((pixel*)pAbove0 + 1, (pixel*)pLeft0 + 1, pred, stride, width, bFilter);
modeCosts[DC_IDX] = sa8d(fenc, stride, pred, stride);
// 0
if (width >= 8 && width <= 32)
{
- src += ADI_BUF_STRIDE * (2 * width + 1);
+ pAbove = pAbove1;
+ pLeft = pLeft1;
}
- primitives.intra_pred_planar(src + ADI_BUF_STRIDE + 1, ADI_BUF_STRIDE, pred, stride, width);
+ primitives.intra_pred_planar(pAbove + 1, pLeft + 1, pred, stride, width);
modeCosts[PLANAR_IDX] = sa8d(fenc, stride, pred, stride);
// 33 Angle modes once
diff -r f813f110d69a -r a4013cdafef0 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp Thu Jul 18 02:10:37 2013 -0500
+++ b/source/test/intrapredharness.cpp Thu Jul 18 15:52:51 2013 +0800
@@ -117,17 +117,26 @@
{
for (int i = 0; i <= 100; i++)
{
+ pixel left[MAX_CU_SIZE * 2 + 1];
+ for (int k = 0; k < width * 2 + 1; k++)
+ {
+ left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
+ }
#if _DEBUG
memset(pixel_out_vec, 0xCD, out_size);
memset(pixel_out_c, 0xCD, out_size);
#endif
- ref(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_c, FENC_STRIDE, width);
- opt(pixel_buff + j, ADI_BUF_STRIDE, pixel_out_vec, FENC_STRIDE, width);
+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, width);
+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);
for (int k = 0; k < width; k++)
{
if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
{
+#if _DEBUG
+ ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c, FENC_STRIDE, width);
+ opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, width);
+#endif
return false;
}
}
@@ -292,7 +301,7 @@
width = ii;
printf("intra_planar%2dx%d", ii, ii);
REPORT_SPEEDUP(opt.intra_pred_planar, ref.intra_pred_planar,
- pixel_buff + srcStride, srcStride, pixel_out_vec, FENC_STRIDE, width);
+ pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width);
}
}
if (opt.intra_pred_ang)
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130718/1244b81a/attachment-0001.html>
More information about the x265-devel
mailing list