[x265-commits] [x265] pixel: fix avx2, 16bpp, and testbench following luma enum reorg
Steve Borho
steve at borho.org
Fri Oct 18 04:45:01 CEST 2013
details: http://hg.videolan.org/x265/rev/39ceb9570c5d
branches:
changeset: 4530:39ceb9570c5d
user: Steve Borho <steve at borho.org>
date: Thu Oct 17 21:00:34 2013 -0500
description:
pixel: fix avx2, 16bpp, and testbench following luma enum reorg
Subject: [x265] intra: remove unused argument to predDCFiltering(), remove static
details: http://hg.videolan.org/x265/rev/d05cf1a4d3a5
branches:
changeset: 4531:d05cf1a4d3a5
user: Steve Borho <steve at borho.org>
date: Thu Oct 17 20:41:19 2013 -0500
description:
intra: remove unused argument to predDCFiltering(), remove static
There's no need to declare the function static; it is already within an anonymous
namespace.
Subject: [x265] intra: segregate 8bpp from 16bpp functions, drop 16bpp angular, drop 64x64
details: http://hg.videolan.org/x265/rev/5ab2da8320f5
branches:
changeset: 4532:5ab2da8320f5
user: Steve Borho <steve at borho.org>
date: Thu Oct 17 21:28:02 2013 -0500
description:
intra: segregate 8bpp from 16bpp functions, drop 16bpp angular, drop 64x64
The HIGH_BIT_DEPTH angular function was just a copy of the C reference, and we do
not need 64x64 blocks any more.
diffstat:
source/common/vec/intra-sse3.cpp | 3106 +++++++++++++++-------------------
source/common/vec/pixel-avx2.cpp | 6 +-
source/common/vec/pixel16-sse41.cpp | 6 +-
source/test/intrapredharness.cpp | 4 +-
source/test/testbench.cpp | 18 +
source/test/testharness.h | 19 +-
6 files changed, 1442 insertions(+), 1717 deletions(-)
diffs (truncated from 3577 to 300 lines):
diff -r b42f1963229b -r 5ab2da8320f5 source/common/vec/intra-sse3.cpp
--- a/source/common/vec/intra-sse3.cpp Thu Oct 17 21:10:34 2013 +0530
+++ b/source/common/vec/intra-sse3.cpp Thu Oct 17 21:28:02 2013 -0500
@@ -37,139 +37,6 @@
using namespace x265;
-extern unsigned char IntraFilterType[][35];
-
-#define PRED_INTRA_ANGLE_4_START() \
- __m128i row11, row12, row21, row22, row31, row32, row41, row42; \
- __m128i tmp16_1, tmp16_2, tmp2, deltaFract; \
- __m128i deltaPos = _mm_set1_epi16(0); \
- __m128i ipAngle = _mm_set1_epi16(0); \
- __m128i thirty1 = _mm_set1_epi16(31); \
- __m128i thirty2 = _mm_set1_epi16(32); \
- bool modeHor = (dirMode < 18);
-
-#define PRED_INTRA_ANGLE_4_END() \
- deltaFract = _mm_and_si128(deltaPos, thirty1); \
- __m128i mullo = _mm_mullo_epi16(row11, _mm_sub_epi16(thirty2, deltaFract)); \
- __m128i sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row12)); \
- row11 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
- \
- deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
- deltaFract = _mm_and_si128(deltaPos, thirty1); \
- mullo = _mm_mullo_epi16(row21, _mm_sub_epi16(thirty2, deltaFract)); \
- sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row22)); \
- row21 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
- \
- deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
- deltaFract = _mm_and_si128(deltaPos, thirty1); \
- mullo = _mm_mullo_epi16(row31, _mm_sub_epi16(thirty2, deltaFract)); \
- sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row32)); \
- row31 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
- \
- deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
- deltaFract = _mm_and_si128(deltaPos, thirty1); \
- mullo = _mm_mullo_epi16(row41, _mm_sub_epi16(thirty2, deltaFract)); \
- sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row42)); \
- row41 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
- \
- if (modeHor) \
- { \
- __m128i _tmp1, _tmp2, _tmp3, _tmp4; \
- \
- _tmp1 = _mm_unpacklo_epi16(row11, row31); \
- _tmp2 = _mm_unpacklo_epi16(row21, row41); \
- _tmp3 = _mm_unpacklo_epi16(_tmp1, _tmp2); \
- _tmp4 = _mm_unpackhi_epi16(_tmp1, _tmp2); \
- \
- tmp16_1 = _mm_packus_epi16(_tmp3, _tmp3); \
- *(uint32_t*)(dst) = _mm_cvtsi128_si32(tmp16_1); \
- _tmp2 = tmp16_1; \
- _tmp2 = _mm_srl_epi64(_tmp2, _mm_cvtsi32_si128(32)); \
- *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_tmp2); \
- tmp16_1 = _mm_packus_epi16(_tmp4, _tmp4); \
- *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(tmp16_1); \
- _tmp2 = tmp16_1; \
- _tmp2 = _mm_srl_epi64(_tmp2, _mm_cvtsi32_si128(32)); \
- *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_tmp2); \
- } \
- else \
- { \
- *(uint32_t*)(dst) = _mm_cvtsi128_si32(_mm_packus_epi16(row11, row11)); \
- *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_mm_packus_epi16(row21, row21)); \
- *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row31, row31)); \
- *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row41, row41)); \
- }
-
-#define PRED_INTRA_ANG8_START() \
- /* Map the mode index to main prediction direction and angle*/ \
- bool modeHor = (dirMode < 18); \
- bool modeVer = !modeHor; \
- int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0; \
- int absAng = abs(intraPredAngle); \
- int signAng = intraPredAngle < 0 ? -1 : 1; \
- /* Set bitshifts and scale the angle parameter to block size*/ \
- int angTable[9] = { 0, 2, 5, 9, 13, 17, 21, 26, 32 }; \
- absAng = angTable[absAng]; \
- intraPredAngle = signAng * absAng; \
- if (modeHor) /* Near horizontal modes*/ \
- { \
- Vec16uc tmp; \
- Vec8s row11, row12; \
- Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2; \
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle; \
- Vec8s tmp1, tmp2; \
- v_deltaPos = 0; \
- v_ipAngle = intraPredAngle; \
-
-#define PRED_INTRA_ANG8_MIDDLE() \
- /* Flip the block */ \
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2); \
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2); \
- row1 = tmp16_1; \
- row2 = tmp16_2; \
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4); \
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4); \
- row3 = tmp16_1; \
- row4 = tmp16_2; \
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2); \
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2); \
- row1 = tmp16_1; \
- row2 = tmp16_2; \
- tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4); \
- tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4); \
- row3 = tmp16_1; \
- row4 = tmp16_2; \
- tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3); \
- tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3); \
- row1 = tmp16_1; \
- row3 = tmp16_2; \
- tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4); \
- tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4); \
- row2 = tmp16_1; \
- row4 = tmp16_2; \
- store_partial(const_int(8), dst, row1); /*row1*/ \
- store_partial(const_int(8), dst + (2 * dstStride), row3); /*row3*/ \
- store_partial(const_int(8), dst + (4 * dstStride), row2); /*row5*/ \
- store_partial(const_int(8), dst + (6 * dstStride), row4); /*row7*/ \
- row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1); \
- store_partial(const_int(8), dst + (1 * dstStride), row1); /*row2*/ \
- row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3); \
- store_partial(const_int(8), dst + (3 * dstStride), row1); /*row4*/ \
- row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2); \
- store_partial(const_int(8), dst + (5 * dstStride), row1); /*row6*/ \
- row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4); \
- store_partial(const_int(8), dst + (7 * dstStride), row1); /*row8*/ \
- } \
- else /* Vertical modes*/ \
- { \
- Vec8s row11, row12; \
- Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle; \
- Vec16uc tmp; \
- Vec8s tmp1, tmp2; \
- v_deltaPos = 0; \
- v_ipAngle = intraPredAngle; \
-
-
namespace {
const int angAP[17][64] =
{
@@ -228,8 +95,8 @@ const int angAP[17][64] =
#define GETAP(X, Y) angAP[8 - (X)][(Y)]
-static inline
-void predDCFiltering(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int /*height*/)
+#if !HIGH_BIT_DEPTH
+inline void predDCFiltering(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
{
int y;
pixel pixDC = *dst;
@@ -240,86 +107,6 @@ void predDCFiltering(pixel* above, pixel
Vec8us im1(pixDCx3);
Vec8us im2, im3;
-#if HIGH_BIT_DEPTH
- switch (width)
- {
- case 4:
- im2 = load_partial(const_int(8), &above[1]);
- im2 = (im1 + im2) >> const_int(2);
- store_partial(const_int(8), &dst[1], im2);
- break;
-
- case 8:
- im2.load(&above[1]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1]);
- break;
-
- case 16:
- im2.load(&above[1]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1]);
-
- im2.load(&above[1 + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 8]);
- break;
-
- case 32:
- im2.load(&above[1]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1]);
-
- im2.load(&above[1 + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 8]);
-
- im2.load(&above[1 + 16]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 16]);
-
- im2.load(&above[1 + 24]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 24]);
- break;
-
- //case 64:
- default:
- im2.load(&above[1]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1]);
-
- im2.load(&above[1 + 8]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 8]);
-
- im2.load(&above[1 + 16]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 16]);
-
- im2.load(&above[1 + 24]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 24]);
-
- im2.load(&above[1 + 32]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 32]);
-
- im2.load(&above[1 + 40]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 40]);
-
- im2.load(&above[1 + 48]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 48]);
-
- im2.load(&above[1 + 56]);
- im2 = (im1 + im2) >> const_int(2);
- im2.store(&dst[1 + 56]);
- break;
- }
-
-#else /* if HIGH_BIT_DEPTH */
Vec16uc pix;
switch (width)
{
@@ -366,45 +153,8 @@ void predDCFiltering(pixel* above, pixel
pix = compress(im2, im3);
pix.store(&dst[1 + 16]);
break;
-
- //case 64:
- default:
- pix.load(&above[1]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&dst[1]);
-
- pix.load(&above[1 + 16]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&dst[1 + 16]);
-
- pix.load(&above[1 + 32]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&dst[1 + 32]);
-
- pix.load(&above[1 + 48]);
- im2 = extend_low(pix);
- im3 = extend_high(pix);
- im2 = (im1 + im2) >> const_int(2);
- im3 = (im1 + im3) >> const_int(2);
- pix = compress(im2, im3);
- pix.store(&dst[1 + 48]);
- break;
}
-#endif /* if HIGH_BIT_DEPTH */
-
for (y = 1; y < width; y++)
{
dst[dstStride] = (pixel)((left[y] + pixDCx3) >> 2);
@@ -417,255 +167,58 @@ void intra_pred_dc(pixel* above, pixel*
int sum;
int logSize = g_convertToBit[width] + 2;
-#if HIGH_BIT_DEPTH
- Vec8s sumLeft(0);
- Vec8s sumAbove(0);
- Vec8s m0;
+ Vec16uc pixL, pixT;
+ Vec8us im;
+ Vec4ui im1, im2;
switch (width)
More information about the x265-commits
mailing list