[x265] [PATCH] xIDCT32 code cleanup
praveen at multicorewareinc.com
Fri Jul 5 11:50:54 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1373017843 -19800
# Node ID e3e4ed95a58af4e69a7494b2ecddcec0e07b3513
# Parent 21934e9f52130a7dec6ea3f7a96ad99d023d61c3
xIDCT32 code cleanup
diff -r 21934e9f5213 -r e3e4ed95a58a source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Fri Jul 05 15:14:04 2013 +0530
+++ b/source/common/vec/dct.inc Fri Jul 05 15:20:43 2013 +0530
@@ -3100,7 +3100,7 @@
_mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
}
-void xIDCT32(int *pSrc, short *pDst, intptr_t stride)
+void xIDCT32(int *src, short *dst, intptr_t stride)
{
//Odd
const __m128i c16_p90_p90 = _mm_set1_epi32(0x005A005A); //column 0
@@ -3282,7 +3282,7 @@
const __m128i c16_p64_p64 = _mm_set1_epi32(0x00400040);
__m128i c32_rnd = _mm_set1_epi32(64);
- Int nShift = 7;
+ int nShift = 7;
// DCT1
__m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
@@ -3290,141 +3290,141 @@
__m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4];
__m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4];
- for (Int i = 0; i < 4; i++)
+ for (int i = 0; i < 4; i++)
{
const int offset = (i << 3);
__m128i T00, T01;
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
in00[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
in01[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
in02[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
in03[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
in04[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
in05[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
in06[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
in07[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
in08[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
in09[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
in10[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
in11[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
in12[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
in13[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
in14[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
in15[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
in16[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
in17[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
in18[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
in19[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
in20[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
in21[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
in22[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
in23[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
in24[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
in25[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
in26[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
in27[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
in28[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
in29[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
in30[i] = _mm_packs_epi32(T00, T01);
- T00 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset]);
- T01 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset + 4]);
+ T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
+ T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
in31[i] = _mm_packs_epi32(T00, T01);
}
- for (Int pass = 0; pass < 2; pass++)
+ for (int pass = 0; pass < 2; pass++)
{
if (pass == 1)
{
@@ -3432,7 +3432,7 @@
nShift = 12;
}
- for (Int part = 0; part < 4; part++)
+ for (int part = 0; part < 4; part++)
{
const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]); // [33 13 32 12 31 11 30 10]
const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]); // [37 17 36 16 35 15 34 14]
@@ -3909,25 +3909,25 @@
}
// Add
- for (Int i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
#define STROE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
- _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
- _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
- _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
- _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
- _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
- _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
- _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
- _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
- _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
- _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
- _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
- _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
- _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
- _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
- _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
- _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
+ _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
+ _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
+ _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
+ _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
+ _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
+ _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
+ _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
+ _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
+ _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
+ _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
+ _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
+ _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
+ _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
+ _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
+ _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
+ _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
const int k = i * 2;
STROE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)
More information about the x265-devel
mailing list