[x265] [PATCH] xIDCT8 intrinsic code cleanup
praveen at multicorewareinc.com
Wed Jul 3 10:32:43 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1372840341 -19800
# Node ID e585a6de675642cdd1884a2ac0e541b3cbb464b5
# Parent 0cb92ad723cedcd10fc2f7b4399a1db459585d72
xIDCT8 intrinsic code cleanup
diff -r 0cb92ad723ce -r e585a6de6756 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Wed Jul 03 13:51:11 2013 +0530
+++ b/source/common/vec/dct.inc Wed Jul 03 14:02:21 2013 +0530
@@ -2436,29 +2436,29 @@
{ 83, 36, 83, 36, 83, 36, 83, 36 },
{ 36, -83, 36, -83, 36, -83, 36, -83 }
};
-void xIDCT8(int *pSrc, short *pDst, intptr_t stride)
+void xIDCT8(int *src, short *dst, intptr_t stride)
{
__m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
__m128i T00, T01, T02, T03, T04, T05, T06, T07;
m128iAdd = _mm_set1_epi32(64);
- T00 = _mm_load_si128((__m128i*)&pSrc[8 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[8 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
m128iS1 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&pSrc[24 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[24 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
m128iS3 = _mm_packs_epi32(T00, T01);
m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
- T00 = _mm_load_si128((__m128i*)&pSrc[40 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[40 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
m128iS5 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&pSrc[56 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[56 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
m128iS7 = _mm_packs_epi32(T00, T01);
m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
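[For context, every load in this function follows the same idiom as the hunk above: eight int32 coefficients of a row are packed down to saturated int16, interleaved with a second row, and multiplied against a coefficient pair with pmaddwd. A minimal standalone sketch of that idiom; madd_pair and its parameter names are illustrative, not from dct.inc:

#include <emmintrin.h>

/* Pack two rows of eight int32 coefficients to int16, interleave them,
 * and apply one interleaved coefficient pair (e.g. a tab_idct_8x8 row)
 * with pmaddwd. This computes only the low half; the real code repeats
 * the unpack/madd with _mm_unpackhi_epi16 for the high half. */
static __m128i madd_pair(const int *rowA, const int *rowB, const short *coefPair)
{
    __m128i a = _mm_packs_epi32(_mm_load_si128((const __m128i*)&rowA[0]),
                                _mm_load_si128((const __m128i*)&rowA[4]));
    __m128i b = _mm_packs_epi32(_mm_load_si128((const __m128i*)&rowB[0]),
                                _mm_load_si128((const __m128i*)&rowB[4]));
    /* each 32-bit lane of lo now holds an (a_i, b_i) int16 pair */
    __m128i lo = _mm_unpacklo_epi16(a, b);
    /* per lane: a_i*c0 + b_i*c1, widened to int32 */
    return _mm_madd_epi16(lo, _mm_loadu_si128((const __m128i*)coefPair));
}
]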
@@ -2491,11 +2491,11 @@
/* ------- */
- T00 = _mm_load_si128((__m128i*)&pSrc[0 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[0 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
m128iS0 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&pSrc[32 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[32 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
m128iS4 = _mm_packs_epi32(T00, T01);
m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
@@ -2507,11 +2507,11 @@
/* ------- */
- T00 = _mm_load_si128((__m128i*)&pSrc[16 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[16 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
m128iS2 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&pSrc[48 + 0]);
- T01 = _mm_load_si128((__m128i*)&pSrc[48 + 4]);
+ T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
+ T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
m128iS6 = _mm_packs_epi32(T00, T01);
m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
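[The even and odd halves assembled in these hunks recombine as the usual HEVC partial butterfly. As a reference for what the vector code computes, a scalar sketch of the first pass on one column, with round = 64 and shift = 7 to match m128iAdd above; the constants are the standard HEVC 8x8 transform coefficients, and the names are illustrative:

/* One column of the first IDCT8 pass over the raster int32 block,
 * saturating to int16 as _mm_packs_epi32 does in the vector path. */
static void idct8_col_pass1(const int *src, int col, short out[8])
{
    static const int g[4][4] = {        /* odd-part coefficients */
        { 89,  75,  50,  18 },
        { 75, -18, -89, -50 },
        { 50, -89,  18,  75 },
        { 18, -50,  75, -89 }
    };
    int O[4], E[4];
    for (int k = 0; k < 4; k++)         /* rows 1, 3, 5, 7 */
        O[k] = g[k][0] * src[8 + col]  + g[k][1] * src[24 + col] +
               g[k][2] * src[40 + col] + g[k][3] * src[56 + col];

    int EE0 = 64 * (src[0 + col] + src[32 + col]);     /* rows 0, 4 */
    int EE1 = 64 * (src[0 + col] - src[32 + col]);
    int EO0 = 83 * src[16 + col] + 36 * src[48 + col]; /* rows 2, 6 */
    int EO1 = 36 * src[16 + col] - 83 * src[48 + col];
    E[0] = EE0 + EO0;  E[1] = EE1 + EO1;
    E[2] = EE1 - EO1;  E[3] = EE0 - EO0;

    for (int k = 0; k < 4; k++) {
        int v0 = (E[k] + O[k] + 64) >> 7;
        int v1 = (E[k] - O[k] + 64) >> 7;
        out[k]     = (short)(v0 < -32768 ? -32768 : v0 > 32767 ? 32767 : v0);
        out[7 - k] = (short)(v1 < -32768 ? -32768 : v1 > 32767 ? 32767 : v1);
    }
}
]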
@@ -2662,31 +2662,31 @@
__m128i T10, T11;
T10 = _mm_unpacklo_epi32(T00, T02); // [31 21 11 01 30 20 10 00]
T11 = _mm_unpackhi_epi32(T00, T02); // [33 23 13 03 32 22 12 02]
- _mm_storel_epi64((__m128i*)&pDst[0 * stride + 0], T10); // [30 20 10 00]
- _mm_storeh_pi((__m64*)&pDst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01]
- _mm_storel_epi64((__m128i*)&pDst[2 * stride + 0], T11); // [32 22 12 02]
- _mm_storeh_pi((__m64*)&pDst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03]
+ _mm_storel_epi64((__m128i*)&dst[0 * stride + 0], T10); // [30 20 10 00]
+ _mm_storeh_pi((__m64*)&dst[1 * stride + 0], _mm_castsi128_ps(T10)); // [31 21 11 01]
+ _mm_storel_epi64((__m128i*)&dst[2 * stride + 0], T11); // [32 22 12 02]
+ _mm_storeh_pi((__m64*)&dst[3 * stride + 0], _mm_castsi128_ps(T11)); // [33 23 13 03]
T10 = _mm_unpacklo_epi32(T04, T06); // [71 61 51 41 70 60 50 40]
T11 = _mm_unpackhi_epi32(T04, T06); // [73 63 53 43 72 62 52 42]
- _mm_storel_epi64((__m128i*)&pDst[0 * stride + 4], T10);
- _mm_storeh_pi((__m64*)&pDst[1 * stride + 4], _mm_castsi128_ps(T10));
- _mm_storel_epi64((__m128i*)&pDst[2 * stride + 4], T11);
- _mm_storeh_pi((__m64*)&pDst[3 * stride + 4], _mm_castsi128_ps(T11));
+ _mm_storel_epi64((__m128i*)&dst[0 * stride + 4], T10);
+ _mm_storeh_pi((__m64*)&dst[1 * stride + 4], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[2 * stride + 4], T11);
+ _mm_storeh_pi((__m64*)&dst[3 * stride + 4], _mm_castsi128_ps(T11));
T10 = _mm_unpacklo_epi32(T01, T03); // [35 25 15 05 34 24 14 04]
T11 = _mm_unpackhi_epi32(T01, T03); // [37 27 17 07 36 26 16 06]
- _mm_storel_epi64((__m128i*)&pDst[4 * stride + 0], T10);
- _mm_storeh_pi((__m64*)&pDst[5 * stride + 0], _mm_castsi128_ps(T10));
- _mm_storel_epi64((__m128i*)&pDst[6 * stride + 0], T11);
- _mm_storeh_pi((__m64*)&pDst[7 * stride + 0], _mm_castsi128_ps(T11));
+ _mm_storel_epi64((__m128i*)&dst[4 * stride + 0], T10);
+ _mm_storeh_pi((__m64*)&dst[5 * stride + 0], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[6 * stride + 0], T11);
+ _mm_storeh_pi((__m64*)&dst[7 * stride + 0], _mm_castsi128_ps(T11));
T10 = _mm_unpacklo_epi32(T05, T07); // [75 65 55 45 74 64 54 44]
T11 = _mm_unpackhi_epi32(T05, T07); // [77 67 57 47 76 66 56 46]
- _mm_storel_epi64((__m128i*)&pDst[4 * stride + 4], T10);
- _mm_storeh_pi((__m64*)&pDst[5 * stride + 4], _mm_castsi128_ps(T10));
- _mm_storel_epi64((__m128i*)&pDst[6 * stride + 4], T11);
- _mm_storeh_pi((__m64*)&pDst[7 * stride + 4], _mm_castsi128_ps(T11));
+ _mm_storel_epi64((__m128i*)&dst[4 * stride + 4], T10);
+ _mm_storeh_pi((__m64*)&dst[5 * stride + 4], _mm_castsi128_ps(T10));
+ _mm_storel_epi64((__m128i*)&dst[6 * stride + 4], T11);
+ _mm_storeh_pi((__m64*)&dst[7 * stride + 4], _mm_castsi128_ps(T11));
}
void xIDCT16(int *pSrc, short *pDst, intptr_t stride)
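[One note on the stores in the final hunk: after the epi32 unpacks, each xmm register carries two four-sample output rows of int16, and since SSE2 has no integer high-half store the upper 64 bits go out through _mm_storeh_pi behind a _mm_castsi128_ps (a reinterpret of the bits, not a conversion). A minimal sketch of that pattern; store_two_rows is illustrative, not from dct.inc:

#include <emmintrin.h>
#include <stdint.h>

/* Write the low and high 64-bit halves of one register to two
 * consecutive strided rows of the int16 destination. */
static void store_two_rows(short *dst, intptr_t stride, __m128i twoRows)
{
    _mm_storel_epi64((__m128i*)&dst[0 * stride], twoRows);
    _mm_storeh_pi((__m64*)&dst[1 * stride], _mm_castsi128_ps(twoRows));
}
]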