[x265] [PATCH] xDCT32 intrinsic code cleanup
praveen at multicorewareinc.com
Wed Jul 3 08:32:00 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1372833108 -19800
# Node ID 88ee36317e5a6c3ff54b7fb394d2cb16ec549591
# Parent ec3d304af07bfeb5daffe886fea80a4ea4b94931
xDCT32 intrinsic code cleanup

Drop the Hungarian prefixes from the function parameters (pSrc -> src,
pDst -> dst) and rename the MAKE_ODD macro parameter from dst to dstPos
so that it no longer clashes with the renamed output pointer.
diff -r ec3d304af07b -r 88ee36317e5a source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Wed Jul 03 11:47:07 2013 +0530
+++ b/source/common/vec/dct.inc Wed Jul 03 12:01:48 2013 +0530
@@ -1722,7 +1722,7 @@
#undef MAKE_COEF16
};
-void xDCT32(short *pSrc, int *pDst, intptr_t nStride)
+void xDCT32(short *src, int *dst, intptr_t nStride)
{
// Const
__m128i c_8 = _mm_set1_epi32(8);
@@ -1746,38 +1746,38 @@
// DCT1
for (i = 0; i < 32 / 8; i++)
{
- T00A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 0) * nStride + 0]); // [07 06 05 04 03 02 01 00]
- T00B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 0) * nStride + 8]); // [15 14 13 12 11 10 09 08]
- T00C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 0) * nStride + 16]); // [23 22 21 20 19 18 17 16]
- T00D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 0) * nStride + 24]); // [31 30 29 28 27 26 25 24]
- T01A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 1) * nStride + 0]);
- T01B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 1) * nStride + 8]);
- T01C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 1) * nStride + 16]);
- T01D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 1) * nStride + 24]);
- T02A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 2) * nStride + 0]);
- T02B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 2) * nStride + 8]);
- T02C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 2) * nStride + 16]);
- T02D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 2) * nStride + 24]);
- T03A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 3) * nStride + 0]);
- T03B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 3) * nStride + 8]);
- T03C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 3) * nStride + 16]);
- T03D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 3) * nStride + 24]);
- T04A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 4) * nStride + 0]);
- T04B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 4) * nStride + 8]);
- T04C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 4) * nStride + 16]);
- T04D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 4) * nStride + 24]);
- T05A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 5) * nStride + 0]);
- T05B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 5) * nStride + 8]);
- T05C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 5) * nStride + 16]);
- T05D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 5) * nStride + 24]);
- T06A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 6) * nStride + 0]);
- T06B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 6) * nStride + 8]);
- T06C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 6) * nStride + 16]);
- T06D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 6) * nStride + 24]);
- T07A = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 7) * nStride + 0]);
- T07B = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 7) * nStride + 8]);
- T07C = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 7) * nStride + 16]);
- T07D = _mm_load_si128((__m128i*)&pSrc[(i * 8 + 7) * nStride + 24]);
+ T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * nStride + 0]); // [07 06 05 04 03 02 01 00]
+ T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * nStride + 8]); // [15 14 13 12 11 10 09 08]
+ T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * nStride + 16]); // [23 22 21 20 19 18 17 16]
+ T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * nStride + 24]); // [31 30 29 28 27 26 25 24]
+ T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * nStride + 0]);
+ T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * nStride + 8]);
+ T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * nStride + 16]);
+ T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * nStride + 24]);
+ T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * nStride + 0]);
+ T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * nStride + 8]);
+ T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * nStride + 16]);
+ T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * nStride + 24]);
+ T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * nStride + 0]);
+ T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * nStride + 8]);
+ T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * nStride + 16]);
+ T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * nStride + 24]);
+ T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * nStride + 0]);
+ T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * nStride + 8]);
+ T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * nStride + 16]);
+ T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * nStride + 24]);
+ T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * nStride + 0]);
+ T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * nStride + 8]);
+ T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * nStride + 16]);
+ T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * nStride + 24]);
+ T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * nStride + 0]);
+ T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * nStride + 8]);
+ T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * nStride + 16]);
+ T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * nStride + 24]);
+ T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * nStride + 0]);
+ T07B = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * nStride + 8]);
+ T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * nStride + 16]);
+ T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * nStride + 24]);
T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1])); // [05 02 06 01 04 03 07 00]
T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0])); // [10 13 09 14 11 12 08 15]
@@ -1925,7 +1925,7 @@
T60 = _mm_packs_epi32(T50, T51);
im[24][i] = T60;
-#define MAKE_ODD(tab, dst) \
+#define MAKE_ODD(tab, dstPos) \
T30 = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
T31 = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
T32 = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
@@ -1945,7 +1945,7 @@
T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
T60 = _mm_packs_epi32(T50, T51); \
- im[(dst)][i] = T60;
+ im[(dstPos)][i] = T60;
MAKE_ODD(0, 4);
MAKE_ODD(1, 12);
@@ -1971,7 +1971,7 @@
MAKE_ODD(11, 30);
#undef MAKE_ODD
-#define MAKE_ODD(tab, dst) \
+#define MAKE_ODD(tab, dstPos) \
T20 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
T21 = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
T22 = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
@@ -2008,7 +2008,7 @@
T50 = _mm_srai_epi32(_mm_add_epi32(T50, c_8), 4); \
T51 = _mm_srai_epi32(_mm_add_epi32(T51, c_8), 4); \
T60 = _mm_packs_epi32(T50, T51); \
- im[(dst)][i] = T60;
+ im[(dstPos)][i] = T60;
MAKE_ODD(12, 1);
MAKE_ODD(14, 3);
@@ -2077,7 +2077,7 @@
T03A = _mm_unpacklo_epi16(T03B, T03C);
T03B = _mm_unpackhi_epi16(T03B, T03C);
-#define MAKE_ODD(tab0, tab1, tab2, tab3, dst) \
+#define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
T20 = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
T21 = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
T22 = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
@@ -2115,7 +2115,7 @@
T60 = _mm_hadd_epi32(T60, T61); \
\
T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), 11); \
- _mm_storeu_si128((__m128i*)&pDst[(dst) * 32 + (i * 4) + 0], T60); \
+ _mm_storeu_si128((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
MAKE_ODD(44, 44, 44, 44, 0);
MAKE_ODD(45, 45, 45, 45, 16);