[x265] [PATCH 1 of 2] primitives: 8 bit : PredIntraAng4x4 function table implementation

Steve Borho steve at borho.org
Wed Jun 26 09:26:14 CEST 2013


On Thu, Jun 27, 2013 at 2:37 AM, <mandar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Mandar Gurav
> # Date 1372240787 25200
> # Node ID e156dc24f05f4c2e6770fde1b46754cce640a96b
> # Parent  7a2555036e8db57557f655f3ed49e38ab6d784dd
> primitives: 8 bit : PredIntraAng4x4 function table implementation
>

folded, tweaked, and pushed



> diff -r 7a2555036e8d -r e156dc24f05f source/common/vec/intrapred.inc
> --- a/source/common/vec/intrapred.inc   Mon Jun 24 22:26:33 2013 -0500
> +++ b/source/common/vec/intrapred.inc   Wed Jun 26 02:59:47 2013 -0700
> @@ -1672,33 +1672,1229 @@
>  }
>
>  #else /* if HIGH_BIT_DEPTH */
> +
> +void PredIntraAng4_32(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec16uc tmp16_1, tmp16_2;
> +    dirMode++;
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
> +    store_partial(const_int(4), pDst, tmp16_1);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
> +    store_partial(const_int(4), pDst + dstStride, tmp16_2);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
> +    store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
> +    store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
> +}
> +
> +void PredIntraAng4_26(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +
> +    row21 = row12;
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> +
> +    row31 = row22;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row32 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> +
> +    row41 = row32;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 32;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(4,5,6,7)
> +
> +    v_deltaPos = v_ipAngle = 26;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_21(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +
> +    row21 = row12;
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> +
> +    row31 = row21;
> +    row32 = row22;
> +
> +    row41 = row22;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> +
> +    v_deltaPos = v_ipAngle = 21;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_17(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +
> +    row21 = row12;
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> +
> +    row31 = row21;
> +    row32 = row22;
> +
> +    row41 = row22;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> +
> +    v_deltaPos = v_ipAngle = 17;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_13(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row12;                  //offsets(1,2,3,4)
> +
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row32 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> +
> +    row41 = row31;                  //offsets(1,2,3,4)
> +    row42 = row32;
> +
> +    v_deltaPos = v_ipAngle = 13;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_9(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row11;
> +    row32 = row12;
> +    row41 = row12;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);
> +
> +    v_deltaPos = v_ipAngle = 9;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_5(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row11;
> +    row32 = row12;
> +    row41 = row11;
> +    row42 = row12;
> +
> +    v_deltaPos = v_ipAngle = 5;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_2(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row11;
> +    row32 = row12;
> +    row41 = row11;
> +    row42 = row12;
> +
> +    v_deltaPos = v_ipAngle = 2;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_2(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row11;
> +    row32 = row12;
> +    row41 = row11;
> +    row42 = row12;
> +
> +    v_deltaPos = v_ipAngle = -2;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_5(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
> +    row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> +    row21 = row11;                  //offsets(0,1,2,3)
> +    row22 = row12;
> +    row31 = row11;
> +    row32 = row12;
> +    row41 = row11;
> +    row42 = row12;
> +
> +    v_deltaPos = v_ipAngle = -5;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_9(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> +    row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> +
> +    row11 = row42;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> +
> +    row21 = row42;                  //offsets(-1,0,1,2)
> +    row22 = row12;
> +    row31 = row42;
> +    row32 = row12;
> +
> +    v_deltaPos = v_ipAngle = -9;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_13(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> +    row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> +
> +    row11 = row42;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> +
> +    row21 = row42;                  //offsets(-1,0,1,2)
> +    row22 = row12;
> +    row31 = row41;
> +    row32 = row42;
> +
> +    v_deltaPos = v_ipAngle = -13;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_17(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> +    row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> +
> +    row31 = row42;                  //offsets(-2,-1,0,1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> +
> +    row21 = row31;                  //offsets(-2,-1,0,1)
> +    row22 = row32;
> +
> +    row11 = row32;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> +
> +    v_deltaPos = v_ipAngle = -17;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_21(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> +    row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> +
> +    row31 = row42;                  //offsets(-2,-1,0,1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> +
> +    row21 = row31;                  //offsets(-2,-1,0,1)
> +    row22 = row32;
> +
> +    row11 = row32;
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> +
> +    v_deltaPos = v_ipAngle = -21;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_26(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> +    Vec16uc tmp16_1, tmp16_2;
> +    Vec2uq tmp2uq;
> +    Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> +    bool modeHor = (dirMode < 18);
> +
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
> +    row41 = extend_low(tmp16_1);    //offsets(-4,-3,-2,-1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 8;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row42 = extend_low(tmp16_2);    //offsets(-3,-2,-1,0)
> +
> +    row31 = row42;                  //offsets(-3,-2,-1,0)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 16;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row32 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> +
> +    row21 = row32;                  //offsets(-2,-1,0,1)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 24;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row22 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> +
> +    row11 = row22;                  //offsets(-1,0,1,2)
> +    tmp2uq = reinterpret_i(tmp16_1);
> +    tmp2uq = tmp2uq >> 32;
> +    tmp16_2 = reinterpret_i(tmp2uq);
> +    row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> +
> +    v_deltaPos = v_ipAngle = -26;
> +
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract * row12) +
> 16) >> 5;
> +
> +    //row2
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract * row22) +
> 16) >> 5;
> +
> +    //row3
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract * row32) +
> 16) >> 5;
> +
> +    //row4
> +    v_deltaPos += v_ipAngle;
> +    v_deltaFract = v_deltaPos & thirty1;
> +    row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract * row42) +
> 16) >> 5;
> +
> +    // Flip the block
> +    if (modeHor)
> +    {
> +        Vec8s tmp1, tmp2, tmp3, tmp4;
> +
> +        tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> +        tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> +
> +        tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> +        tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> +
> +        tmp16_1 = compress_unsafe(tmp3, tmp3);
> +        store_partial(const_int(4), pDst, tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + dstStride, tmp2uq);
> +
> +        tmp16_1 = compress_unsafe(tmp4, tmp4);
> +        store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> +
> +        tmp2uq = reinterpret_i(tmp16_1);
> +        tmp2uq >>= 32;
> +        store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> +    }
> +    else
> +    {
> +        store_partial(const_int(4), pDst, compress_unsafe(row11, row11));
> +        store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> +        store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> +        store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> +    }
> +}
> +
> +void PredIntraAng4_m_32(pixel* pDst, int dstStride, pixel *refMain, int
> dirMode)
> +{
> +    Vec16uc tmp16_1, tmp16_2;
> +    dirMode++;
> +    tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);    //-1,0,1,2
> +    store_partial(const_int(4), pDst, tmp16_1);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> //-2,-1,0,1
> +    store_partial(const_int(4), pDst + dstStride, tmp16_2);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> +    store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
> +    tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
> +    store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
> +}
> +
> +typedef void (*PredIntraAng4x4_table)(pixel* pDst, int dstStride, pixel
> *refMain, int dirMode);
> +PredIntraAng4x4_table PredIntraAng[] = {
> +    /* PredIntraAng4_0 is replaced with PredIntraAng4_2. For
> PredIntraAng4_0 we go through the default path in xPredIntraAng4x4
> because we cannot afford to pass a large number of arguments to this function.
> */
> +    PredIntraAng4_32,
> +    PredIntraAng4_26,
> +    PredIntraAng4_21,
> +    PredIntraAng4_17,
> +    PredIntraAng4_13,
> +    PredIntraAng4_9,
> +    PredIntraAng4_5,
> +    PredIntraAng4_2,
> +    PredIntraAng4_2,    //Intentionally wrong! It should be
> "PredIntraAng4_0" here.
> +    PredIntraAng4_m_2,
> +    PredIntraAng4_m_5,
> +    PredIntraAng4_m_9,
> +    PredIntraAng4_m_13,
> +    PredIntraAng4_m_17,
> +    PredIntraAng4_m_21,
> +    PredIntraAng4_m_26,
> +    PredIntraAng4_m_32,
> +    PredIntraAng4_m_26,
> +    PredIntraAng4_m_21,
> +    PredIntraAng4_m_17,
> +    PredIntraAng4_m_13,
> +    PredIntraAng4_m_9,
> +    PredIntraAng4_m_5,
> +    PredIntraAng4_m_2,
> +    PredIntraAng4_2,    //Intentionally wrong! It should be
> "PredIntraAng4_0" here.
> +    PredIntraAng4_2,
> +    PredIntraAng4_5,
> +    PredIntraAng4_9,
> +    PredIntraAng4_13,
> +    PredIntraAng4_17,
> +    PredIntraAng4_21,
> +    PredIntraAng4_26,
> +    PredIntraAng4_32
> +};
>  void xPredIntraAng4x4(int /*bitDepth*/, pixel* pDst, int dstStride, int
> width, int dirMode, pixel *refLeft, pixel *refAbove, bool bFilter = true)
>  {
> -    int blkSize        = width;
> -
> -    // Map the mode index to main prediction direction and angle
>      assert(dirMode > 1); //no planar and dc
> +    int mode_to_angle_table[] = {32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5,
> -9, -13, -17, -21, -26, -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9,
> 13, 17, 21, 26, 32};
> +    int mode_to_invAng_table[] = {256, 315, 390, 482, 630, 910, 1638,
> 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256, 315, 390, 482, 630, 910,
> 1638, 4096, 0, 4096, 1638, 910, 630, 482, 390, 315, 256};
> +    int intraPredAngle = mode_to_angle_table[dirMode-2];
> +    int invAngle       = mode_to_invAng_table[dirMode-2];
> +
>      bool modeHor       = (dirMode < 18);
>      bool modeVer       = !modeHor;
> -    int intraPredAngle = modeVer ? (int)dirMode - VER_IDX : modeHor ?
> -((int)dirMode - HOR_IDX) : 0;
> -    int absAng         = abs(intraPredAngle);
> -    int signAng        = intraPredAngle < 0 ? -1 : 1;
> -
> -    // Set bitshifts and scale the angle parameter to block size
> -    int angTable[9]    = { 0,    2,    5,   9,  13,  17,  21,  26,  32 };
> -    int invAngTable[9] = { 0, 4096, 1638, 910, 630, 482, 390, 315, 256 };
> // (256 * 32) / Angle
> -    int invAngle       = invAngTable[absAng];
> -    absAng             = angTable[absAng];
> -    intraPredAngle     = signAng * absAng;
>
>      // Do angular predictions
> -
>      pixel* refMain;
>      pixel* refSide;
>
>      // Initialise the Main and Left reference array.
>      if (intraPredAngle < 0)
>      {
> +        int blkSize = width;
>          refMain = (modeVer ? refAbove : refLeft);     // + (blkSize - 1);
>          refSide = (modeVer ? refLeft : refAbove);     // + (blkSize - 1);
>
> @@ -1785,415 +2981,7 @@
>      }
>      else
>      {
> -        Vec8s row11, row12, row21, row22, row31, row32, row41, row42;
> -        Vec16uc tmp16_1, tmp16_2;
> -        Vec2uq tmp2uq;
> -        Vec8s v_deltaFract, v_deltaPos(0), thirty2(32), thirty1(31),
> v_ipAngle(0);
> -        switch (intraPredAngle)
> -        {
> -        case -32:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
>  //-1,0,1,2
> -            store_partial(const_int(4), pDst, tmp16_1);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> //-2,-1,0,1
> -            store_partial(const_int(4), pDst + dstStride, tmp16_2);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> -            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain - 3);
> -            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
> -            return;
> -
> -        case -26:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 3);
> -            row41 = extend_low(tmp16_1);    //offsets(-4,-3,-2,-1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(-3,-2,-1,0)
> -
> -            row31 = row42;                  //offsets(-3,-2,-1,0)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row32 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> -
> -            row21 = row32;                  //offsets(-2,-1,0,1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row22 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row11 = row22;                  //offsets(-1,0,1,2)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 32;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(0,1,2,3)
> -
> -            v_deltaPos = v_ipAngle = -26;
> -            break;
> -
> -        case -21:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> -            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> -
> -            row31 = row42;                  //offsets(-2,-1,0,1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row21 = row31;                  //offsets(-2,-1,0,1)
> -            row22 = row32;
> -
> -            row11 = row32;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            v_deltaPos = v_ipAngle = -21;
> -            break;
> -
> -        case -17:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 2);
> -            row41 = extend_low(tmp16_1);    //offsets(-3,-2,-1,0)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(-2,-1,0,1)
> -
> -            row31 = row42;                  //offsets(-2,-1,0,1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row32 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row21 = row31;                  //offsets(-2,-1,0,1)
> -            row22 = row32;
> -
> -            row11 = row32;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            v_deltaPos = v_ipAngle = -17;
> -            break;
> -
> -        case -13:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> -            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row11 = row42;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row21 = row42;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row41;
> -            row32 = row42;
> -
> -            v_deltaPos = v_ipAngle = -13;
> -            break;
> -
> -        case -9:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain - 1);
> -            row41 = extend_low(tmp16_1);    //offsets(-2,-1,0,1)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row11 = row42;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(-1,0,1,2)
> -
> -            row21 = row42;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row42;
> -            row32 = row12;
> -
> -            v_deltaPos = v_ipAngle = -9;
> -            break;
> -
> -        case -5:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row11;
> -            row32 = row12;
> -            row41 = row11;
> -            row42 = row12;
> -
> -            v_deltaPos = v_ipAngle = -5;
> -            break;
> -
> -        case -2:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain);
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row11;
> -            row32 = row12;
> -            row41 = row11;
> -            row42 = row12;
> -
> -            v_deltaPos = v_ipAngle = -2;
> -            break;
> -
> -        case 2:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row11;
> -            row32 = row12;
> -            row41 = row11;
> -            row42 = row12;
> -
> -            v_deltaPos = v_ipAngle = 2;
> -            break;
> -
> -        case 5:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row11;
> -            row32 = row12;
> -            row41 = row11;
> -            row42 = row12;
> -
> -            v_deltaPos = v_ipAngle = 5;
> -            break;
> -
> -        case 9:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row11;
> -            row32 = row12;
> -            row41 = row12;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);
> -
> -            v_deltaPos = v_ipAngle = 9;
> -            break;
> -
> -        case 13:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -
> -            row21 = row11;                  //offsets(0,1,2,3)
> -            row22 = row12;
> -            row31 = row12;                  //offsets(1,2,3,4)
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row32 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> -
> -            row41 = row31;                  //offsets(1,2,3,4)
> -            row42 = row32;
> -
> -            v_deltaPos = v_ipAngle = 13;
> -            break;
> -
> -        case 17:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -
> -            row21 = row12;
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> -
> -            row31 = row21;
> -            row32 = row22;
> -
> -            row41 = row22;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> -
> -            v_deltaPos = v_ipAngle = 17;
> -            break;
> -
> -        case 21:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -
> -            row21 = row12;
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> -
> -            row31 = row21;
> -            row32 = row22;
> -
> -            row41 = row22;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> -
> -            v_deltaPos = v_ipAngle = 21;
> -            break;
> -
> -        case 26:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 1);
> -
> -            row11 = extend_low(tmp16_1);    //offsets(0,1,2,3)
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 8;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row12 = extend_low(tmp16_2);    //offsets(1,2,3,4)
> -
> -            row21 = row12;
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 16;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row22 = extend_low(tmp16_2);    //offsets(2,3,4,5)
> -
> -            row31 = row22;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 24;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row32 = extend_low(tmp16_2);    //offsets(3,4,5,6)
> -
> -            row41 = row32;
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq = tmp2uq >> 32;
> -            tmp16_2 = reinterpret_i(tmp2uq);
> -            row42 = extend_low(tmp16_2);    //offsets(4,5,6,7)
> -
> -            v_deltaPos = v_ipAngle = 26;
> -            break;
> -
> -        case 32:
> -            tmp16_1 = (Vec16uc)load_partial(const_int(8), refMain + 2);
> -            store_partial(const_int(4), pDst, tmp16_1);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 3);
> -            store_partial(const_int(4), pDst + dstStride, tmp16_2);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 4);
> -            store_partial(const_int(4), pDst + 2 * dstStride, tmp16_2);
> -            tmp16_2 = (Vec16uc)load_partial(const_int(8), refMain + 5);
> -            store_partial(const_int(4), pDst + 3 * dstStride, tmp16_2);
> -            return;
> -        }
> -
> -        //row1
> -        v_deltaFract = v_deltaPos & thirty1;
> -        row11 = ((thirty2 - v_deltaFract) * row11 + (v_deltaFract *
> row12) + 16) >> 5;
> -
> -        //row2
> -        v_deltaPos += v_ipAngle;
> -        v_deltaFract = v_deltaPos & thirty1;
> -        row21 = ((thirty2 - v_deltaFract) * row21 + (v_deltaFract *
> row22) + 16) >> 5;
> -
> -        //row3
> -        v_deltaPos += v_ipAngle;
> -        v_deltaFract = v_deltaPos & thirty1;
> -        row31 = ((thirty2 - v_deltaFract) * row31 + (v_deltaFract *
> row32) + 16) >> 5;
> -
> -        //row4
> -        v_deltaPos += v_ipAngle;
> -        v_deltaFract = v_deltaPos & thirty1;
> -        row41 = ((thirty2 - v_deltaFract) * row41 + (v_deltaFract *
> row42) + 16) >> 5;
> -
> -        // Flip the block
> -
> -        if (modeHor)
> -        {
> -            Vec8s tmp1, tmp2, tmp3, tmp4;
> -
> -            tmp1 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row11, row31);
> -            tmp2 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(row21, row41);
> -
> -            tmp3 = blend8s<0, 8, 1, 9, 2, 10, 3, 11>(tmp1, tmp2);
> -            tmp4 = blend8s<4, 12, 5, 13, 6, 14, 7, 15>(tmp1, tmp2);
> -
> -            tmp16_1 = compress_unsafe(tmp3, tmp3);
> -            store_partial(const_int(4), pDst, tmp16_1);
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq >>= 32;
> -            store_partial(const_int(4), pDst + dstStride, tmp2uq);
> -
> -            tmp16_1 = compress_unsafe(tmp4, tmp4);
> -            store_partial(const_int(4), pDst + (2 * dstStride), tmp16_1);
> -
> -            tmp2uq = reinterpret_i(tmp16_1);
> -            tmp2uq >>= 32;
> -            store_partial(const_int(4), pDst + (3 * dstStride), tmp2uq);
> -        }
> -        else
> -        {
> -            store_partial(const_int(4), pDst, compress_unsafe(row11,
> row11));
> -            store_partial(const_int(4), pDst + (dstStride),
> compress_unsafe(row21, row21));
> -            store_partial(const_int(4), pDst + (2 * dstStride),
> compress_unsafe(row31, row31));
> -            store_partial(const_int(4), pDst + (3 * dstStride),
> compress_unsafe(row41, row41));
> -        }
> +        PredIntraAng[dirMode-2](pDst, dstStride, refMain, dirMode);
>      }
>  }
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> http://mailman.videolan.org/listinfo/x265-devel
>
>


-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130626/fdcc8a14/attachment-0001.html>


More information about the x265-devel mailing list