[x265] [PATCH] xIDCT32 code cleanup

Steve Borho steve at borho.org
Sat Jul 6 21:37:41 CEST 2013


On Fri, Jul 5, 2013 at 4:50 AM, <praveen at multicorewareinc.com> wrote:

> # HG changeset patch
> # User praveentiwari
> # Date 1373017843 -19800
> # Node ID e3e4ed95a58af4e69a7494b2ecddcec0e07b3513
> # Parent  21934e9f52130a7dec6ea3f7a96ad99d023d61c3
> xIDCT32 code cleanup
>
> diff -r 21934e9f5213 -r e3e4ed95a58a source/common/vec/dct.inc
> --- a/source/common/vec/dct.inc Fri Jul 05 15:14:04 2013 +0530
> +++ b/source/common/vec/dct.inc Fri Jul 05 15:20:43 2013 +0530
> @@ -3100,7 +3100,7 @@
>      _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
>  }
>
> -void xIDCT32(int *pSrc, short *pDst, intptr_t stride)
> +void xIDCT32(int *src, short *dst, intptr_t stride)
>  {
>      //Odd
>      const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
> @@ -3282,7 +3282,7 @@
>      const __m128i c16_p64_p64   = _mm_set1_epi32(0x00400040);
>      __m128i c32_rnd             = _mm_set1_epi32(64);
>
> -    Int nShift = 7;
> +    int nShift = 7;
>

nShift should be shift, to match the other renames in this cleanup.
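
For example, the rename would look like this in context (a sketch only; the
elided lines are unchanged from the patch):

    int shift = 7;                       // first-pass shift

    // ...

    for (int pass = 0; pass < 2; pass++)
    {
        if (pass == 1)
        {
            // ...
            shift = 12;                  // second-pass shift
        }
        // ...
    }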


>      // DCT1
>      __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
> @@ -3290,141 +3290,141 @@
>      __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4];
>      __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4];
>
> -    for (Int i = 0; i < 4; i++)
> +    for (int i = 0; i < 4; i++)
>      {
>          const int offset = (i << 3);
>          __m128i T00, T01;
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
>          in00[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
>          in01[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
>          in02[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
>          in03[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
>          in04[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
>          in05[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
>          in06[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
>          in07[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
>          in08[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
>          in09[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
>          in10[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
>          in11[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
>          in12[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
>          in13[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
>          in14[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
>          in15[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
>          in16[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
>          in17[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
>          in18[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
>          in19[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
>          in20[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
>          in21[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
>          in22[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
>          in23[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
>          in24[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
>          in25[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
>          in26[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
>          in27[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
>          in28[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
>          in29[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
>          in30[i]  = _mm_packs_epi32(T00, T01);
>
> -        T00 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset]);
> -        T01 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset + 4]);
> +        T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
> +        T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
>          in31[i]  = _mm_packs_epi32(T00, T01);
>      }
>
> -    for (Int pass = 0; pass < 2; pass++)
> +    for (int pass = 0; pass < 2; pass++)
>      {
>          if (pass == 1)
>          {
> @@ -3432,7 +3432,7 @@
>              nShift  = 12;
>          }
>
> -        for (Int part = 0; part < 4; part++)
> +        for (int part = 0; part < 4; part++)
>          {
>              const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);       // [33 13 32 12 31 11 30 10]
>              const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);       // [37 17 36 16 35 15 34 14]
> @@ -3909,25 +3909,25 @@
>      }
>
>      // Add
> -    for (Int i = 0; i < 2; i++)
> +    for (int i = 0; i < 2; i++)
>      {
>  #define STROE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
> -    _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
> -    _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
> -    _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
> -    _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
> -    _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
> -    _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
> -    _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
> -    _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
> -    _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
> -    _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
> -    _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
> -    _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
> -    _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
> -    _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
> -    _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
> -    _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
> +    _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
> +    _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
> +    _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
> +    _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
> +    _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
> +    _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
> +    _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
> +    _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
> +    _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
> +    _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
> +    _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
> +    _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
> +    _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
> +    _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
> +    _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
> +    _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
>
>          const int k = i * 2;
>          STROE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)



-- 
Steve Borho