[x265-commits] [x265] pixel: fix avx2, 16bpp, and testbench following luma enum reorg

Steve Borho steve at borho.org
Fri Oct 18 04:45:01 CEST 2013


details:   http://hg.videolan.org/x265/rev/39ceb9570c5d
branches:  
changeset: 4530:39ceb9570c5d
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 17 21:00:34 2013 -0500
description:
pixel: fix avx2, 16bpp, and testbench following luma enum reorg
Subject: [x265] intra: remove unused argument to predDCFiltering(), remove static

details:   http://hg.videolan.org/x265/rev/d05cf1a4d3a5
branches:  
changeset: 4531:d05cf1a4d3a5
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 17 20:41:19 2013 -0500
description:
intra: remove unused argument to predDCFiltering(), remove static

There's no need to declare the function static; it is already within an
anonymous namespace.
Subject: [x265] intra: segregate 8bpp from 16bpp functions, drop 16bpp angular, drop 64x64

details:   http://hg.videolan.org/x265/rev/5ab2da8320f5
branches:  
changeset: 4532:5ab2da8320f5
user:      Steve Borho <steve at borho.org>
date:      Thu Oct 17 21:28:02 2013 -0500
description:
intra: segregate 8bpp from 16bpp functions, drop 16bpp angular, drop 64x64

The HIGH_BIT_DEPTH angular function was just a copy of the C reference, so it
is dropped. The 64x64 cases are removed because we no longer need 64x64 blocks.

diffstat:

 source/common/vec/intra-sse3.cpp    |  3106 +++++++++++++++-------------------
 source/common/vec/pixel-avx2.cpp    |     6 +-
 source/common/vec/pixel16-sse41.cpp |     6 +-
 source/test/intrapredharness.cpp    |     4 +-
 source/test/testbench.cpp           |    18 +
 source/test/testharness.h           |    19 +-
 6 files changed, 1442 insertions(+), 1717 deletions(-)

diffs (truncated from 3577 to 300 lines):

diff -r b42f1963229b -r 5ab2da8320f5 source/common/vec/intra-sse3.cpp
--- a/source/common/vec/intra-sse3.cpp	Thu Oct 17 21:10:34 2013 +0530
+++ b/source/common/vec/intra-sse3.cpp	Thu Oct 17 21:28:02 2013 -0500
@@ -37,139 +37,6 @@
 
 using namespace x265;
 
-extern unsigned char IntraFilterType[][35];
-
-#define PRED_INTRA_ANGLE_4_START() \
-    __m128i row11, row12, row21, row22, row31, row32, row41, row42; \
-    __m128i tmp16_1, tmp16_2, tmp2, deltaFract; \
-    __m128i deltaPos = _mm_set1_epi16(0); \
-    __m128i ipAngle  = _mm_set1_epi16(0); \
-    __m128i thirty1  = _mm_set1_epi16(31); \
-    __m128i thirty2  = _mm_set1_epi16(32); \
-    bool modeHor     = (dirMode < 18);
-
-#define PRED_INTRA_ANGLE_4_END() \
-    deltaFract = _mm_and_si128(deltaPos, thirty1); \
-    __m128i mullo = _mm_mullo_epi16(row11, _mm_sub_epi16(thirty2, deltaFract)); \
-    __m128i sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row12)); \
-    row11 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
-         \
-    deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
-    deltaFract = _mm_and_si128(deltaPos, thirty1); \
-    mullo = _mm_mullo_epi16(row21, _mm_sub_epi16(thirty2, deltaFract)); \
-    sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row22)); \
-    row21 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
-         \
-    deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
-    deltaFract = _mm_and_si128(deltaPos, thirty1); \
-    mullo = _mm_mullo_epi16(row31, _mm_sub_epi16(thirty2, deltaFract)); \
-    sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row32)); \
-    row31 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
-         \
-    deltaPos = _mm_add_epi16(deltaPos, ipAngle); \
-    deltaFract = _mm_and_si128(deltaPos, thirty1); \
-    mullo = _mm_mullo_epi16(row41, _mm_sub_epi16(thirty2, deltaFract)); \
-    sum = _mm_add_epi16(_mm_set1_epi16(16), _mm_mullo_epi16(deltaFract, row42)); \
-    row41 = _mm_sra_epi16(_mm_add_epi16(mullo, sum), _mm_cvtsi32_si128(5)); \
-         \
-    if (modeHor) \
-    { \
-        __m128i _tmp1, _tmp2, _tmp3, _tmp4; \
-             \
-        _tmp1 = _mm_unpacklo_epi16(row11, row31); \
-        _tmp2 = _mm_unpacklo_epi16(row21, row41); \
-        _tmp3 = _mm_unpacklo_epi16(_tmp1, _tmp2); \
-        _tmp4 = _mm_unpackhi_epi16(_tmp1, _tmp2); \
-             \
-        tmp16_1 = _mm_packus_epi16(_tmp3, _tmp3); \
-        *(uint32_t*)(dst) = _mm_cvtsi128_si32(tmp16_1); \
-        _tmp2 = tmp16_1; \
-        _tmp2 = _mm_srl_epi64(_tmp2, _mm_cvtsi32_si128(32)); \
-        *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_tmp2); \
-        tmp16_1 = _mm_packus_epi16(_tmp4, _tmp4); \
-        *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(tmp16_1); \
-        _tmp2 = tmp16_1; \
-        _tmp2 = _mm_srl_epi64(_tmp2, _mm_cvtsi32_si128(32)); \
-        *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_tmp2); \
-    } \
-    else \
-    { \
-        *(uint32_t*)(dst) = _mm_cvtsi128_si32(_mm_packus_epi16(row11, row11)); \
-        *(uint32_t*)(dst + dstStride) = _mm_cvtsi128_si32(_mm_packus_epi16(row21, row21)); \
-        *(uint32_t*)(dst + (2 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row31, row31)); \
-        *(uint32_t*)(dst + (3 * dstStride)) = _mm_cvtsi128_si32(_mm_packus_epi16(row41, row41)); \
-    }
-
-#define PRED_INTRA_ANG8_START() \
-    /* Map the mode index to main prediction direction and angle*/ \
-    bool modeHor       = (dirMode < 18);    \
-    bool modeVer       = !modeHor;  \
-    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ? -((int)dirMode - HOR_IDX) : 0; \
-    int absAng         = abs(intraPredAngle);   \
-    int signAng        = intraPredAngle < 0 ? -1 : 1;   \
-    /* Set bitshifts and scale the angle parameter to block size*/  \
-    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 }; \
-    absAng             = angTable[absAng];  \
-    intraPredAngle     = signAng * absAng;  \
-    if (modeHor)         /* Near horizontal modes*/   \
-    { \
-        Vec16uc tmp;    \
-        Vec8s row11, row12; \
-        Vec16uc row1, row2, row3, row4, tmp16_1, tmp16_2;   \
-        Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;    \
-        Vec8s tmp1, tmp2;   \
-        v_deltaPos = 0; \
-        v_ipAngle = intraPredAngle; \
-
-#define PRED_INTRA_ANG8_MIDDLE()  \
-    /* Flip the block */    \
-    tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);    \
-    tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);  \
-    row1 = tmp16_1; \
-    row2 = tmp16_2; \
-    tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);    \
-    tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);  \
-    row3 = tmp16_1; \
-    row4 = tmp16_2; \
-    tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row1, row2);    \
-    tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row1, row2);  \
-    row1 = tmp16_1; \
-    row2 = tmp16_2; \
-    tmp16_1 = blend16uc<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(row3, row4);    \
-    tmp16_2 = blend16uc<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(row3, row4);  \
-    row3 = tmp16_1; \
-    row4 = tmp16_2; \
-    tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row1, (Vec4i)row3);    \
-    tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row1, (Vec4i)row3);    \
-    row1 = tmp16_1; \
-    row3 = tmp16_2; \
-    tmp16_1 = blend4i<0, 4, 1, 5>((Vec4i)row2, (Vec4i)row4);    \
-    tmp16_2 = blend4i<2, 6, 3, 7>((Vec4i)row2, (Vec4i)row4);    \
-    row2 = tmp16_1; \
-    row4 = tmp16_2; \
-    store_partial(const_int(8), dst, row1);       /*row1*/   \
-    store_partial(const_int(8), dst + (2 * dstStride), row3);       /*row3*/   \
-    store_partial(const_int(8), dst + (4 * dstStride), row2);       /*row5*/   \
-    store_partial(const_int(8), dst + (6 * dstStride), row4);       /*row7*/   \
-    row1 = blend2q<1, 3>((Vec2q)row1, (Vec2q)row1); \
-    store_partial(const_int(8), dst + (1 * dstStride), row1);       /*row2*/   \
-    row1 = blend2q<1, 3>((Vec2q)row3, (Vec2q)row3); \
-    store_partial(const_int(8), dst + (3 * dstStride), row1);       /*row4*/   \
-    row1 = blend2q<1, 3>((Vec2q)row2, (Vec2q)row2);    \
-    store_partial(const_int(8), dst + (5 * dstStride), row1);       /*row6*/   \
-    row1 = blend2q<1, 3>((Vec2q)row4, (Vec2q)row4); \
-    store_partial(const_int(8), dst + (7 * dstStride), row1);       /*row8*/   \
-    }   \
-    else                         /* Vertical modes*/    \
-    { \
-        Vec8s row11, row12; \
-        Vec8s v_deltaFract, v_deltaPos, thirty2(32), thirty1(31), v_ipAngle;    \
-        Vec16uc tmp;    \
-        Vec8s tmp1, tmp2;   \
-        v_deltaPos = 0; \
-        v_ipAngle = intraPredAngle; \
-
-
 namespace {
 const int angAP[17][64] =
 {
@@ -228,8 +95,8 @@ const int angAP[17][64] =
 
 #define GETAP(X, Y) angAP[8 - (X)][(Y)]
 
-static inline
-void predDCFiltering(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int /*height*/)
+#if !HIGH_BIT_DEPTH
+inline void predDCFiltering(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width)
 {
     int y;
     pixel pixDC = *dst;
@@ -240,86 +107,6 @@ void predDCFiltering(pixel* above, pixel
 
     Vec8us im1(pixDCx3);
     Vec8us im2, im3;
-#if HIGH_BIT_DEPTH
-    switch (width)
-    {
-    case 4:
-        im2 = load_partial(const_int(8), &above[1]);
-        im2 = (im1 + im2) >> const_int(2);
-        store_partial(const_int(8), &dst[1], im2);
-        break;
-
-    case 8:
-        im2.load(&above[1]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1]);
-        break;
-
-    case 16:
-        im2.load(&above[1]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1]);
-
-        im2.load(&above[1 + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 8]);
-        break;
-
-    case 32:
-        im2.load(&above[1]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1]);
-
-        im2.load(&above[1 + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 8]);
-
-        im2.load(&above[1 + 16]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 16]);
-
-        im2.load(&above[1 + 24]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 24]);
-        break;
-
-    //case 64:
-    default:
-        im2.load(&above[1]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1]);
-
-        im2.load(&above[1 + 8]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 8]);
-
-        im2.load(&above[1 + 16]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 16]);
-
-        im2.load(&above[1 + 24]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 24]);
-
-        im2.load(&above[1 + 32]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 32]);
-
-        im2.load(&above[1 + 40]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 40]);
-
-        im2.load(&above[1 + 48]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 48]);
-
-        im2.load(&above[1 + 56]);
-        im2 = (im1 + im2) >> const_int(2);
-        im2.store(&dst[1 + 56]);
-        break;
-    }
-
-#else /* if HIGH_BIT_DEPTH */
     Vec16uc pix;
     switch (width)
     {
@@ -366,45 +153,8 @@ void predDCFiltering(pixel* above, pixel
         pix = compress(im2, im3);
         pix.store(&dst[1 + 16]);
         break;
-
-    //case 64:
-    default:
-        pix.load(&above[1]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&dst[1]);
-
-        pix.load(&above[1 + 16]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&dst[1 + 16]);
-
-        pix.load(&above[1 + 32]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&dst[1 + 32]);
-
-        pix.load(&above[1 + 48]);
-        im2 = extend_low(pix);
-        im3 = extend_high(pix);
-        im2 = (im1 + im2) >> const_int(2);
-        im3 = (im1 + im3) >> const_int(2);
-        pix = compress(im2, im3);
-        pix.store(&dst[1 + 48]);
-        break;
     }
 
-#endif /* if HIGH_BIT_DEPTH */
-
     for (y = 1; y < width; y++)
     {
         dst[dstStride] = (pixel)((left[y] + pixDCx3) >> 2);
@@ -417,255 +167,58 @@ void intra_pred_dc(pixel* above, pixel* 
     int sum;
     int logSize = g_convertToBit[width] + 2;
 
-#if HIGH_BIT_DEPTH
-    Vec8s sumLeft(0);
-    Vec8s sumAbove(0);
-    Vec8s m0;
+    Vec16uc pixL, pixT;
+    Vec8us  im;
+    Vec4ui  im1, im2;
 
     switch (width)


More information about the x265-commits mailing list