[x265] [PATCH] xDCT8 intrinsic code cleanup
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Jul 2 11:53:54 CEST 2013
# HG changeset patch
# User praveentiwari
# Date 1372758821 -19800
# Node ID 879245f5eedca52db300147368c523b8e12c33bc
# Parent fccd5390566cebffc8b5a6b7408384a11d5c215f
xDCT8 intrinsic code cleanup
diff -r fccd5390566c -r 879245f5eedc source/common/vec/dct.inc
--- a/source/common/vec/dct.inc Tue Jul 02 14:15:16 2013 +0530
+++ b/source/common/vec/dct.inc Tue Jul 02 15:23:41 2013 +0530
@@ -455,7 +455,7 @@
{ 50, -50, 75, -75, -89, 89, 18, -18 },
{ 18, -18, -89, 89, -50, 50, 75, -75 },
};
-void xDCT8(short *pSrc, int *pDst, intptr_t nStride)
+void xDCT8(short *src, int *dst, intptr_t nStride)
{
// Const
__m128i c_2 = _mm_set1_epi32(2);
@@ -469,14 +469,14 @@
__m128i T40, T41, T42, T43, T44, T45, T46, T47;
__m128i T50, T51, T52, T53, T54, T55, T56, T57;
- T00 = _mm_load_si128((__m128i*)&pSrc[0 * nStride]); // [07 06 05 04 03 02 01 00]
- T01 = _mm_load_si128((__m128i*)&pSrc[1 * nStride]); // [17 16 15 14 13 12 11 10]
- T02 = _mm_load_si128((__m128i*)&pSrc[2 * nStride]); // [27 26 25 24 23 22 21 20]
- T03 = _mm_load_si128((__m128i*)&pSrc[3 * nStride]); // [37 36 35 34 33 32 31 30]
- T04 = _mm_load_si128((__m128i*)&pSrc[4 * nStride]); // [47 46 45 44 43 42 41 40]
- T05 = _mm_load_si128((__m128i*)&pSrc[5 * nStride]); // [57 56 55 54 53 52 51 50]
- T06 = _mm_load_si128((__m128i*)&pSrc[6 * nStride]); // [67 66 65 64 63 62 61 60]
- T07 = _mm_load_si128((__m128i*)&pSrc[7 * nStride]); // [77 76 75 74 73 72 71 70]
+ T00 = _mm_load_si128((__m128i*)&src[0 * nStride]); // [07 06 05 04 03 02 01 00]
+ T01 = _mm_load_si128((__m128i*)&src[1 * nStride]); // [17 16 15 14 13 12 11 10]
+ T02 = _mm_load_si128((__m128i*)&src[2 * nStride]); // [27 26 25 24 23 22 21 20]
+ T03 = _mm_load_si128((__m128i*)&src[3 * nStride]); // [37 36 35 34 33 32 31 30]
+ T04 = _mm_load_si128((__m128i*)&src[4 * nStride]); // [47 46 45 44 43 42 41 40]
+ T05 = _mm_load_si128((__m128i*)&src[5 * nStride]); // [57 56 55 54 53 52 51 50]
+ T06 = _mm_load_si128((__m128i*)&src[6 * nStride]); // [67 66 65 64 63 62 61 60]
+ T07 = _mm_load_si128((__m128i*)&src[7 * nStride]); // [77 76 75 74 73 72 71 70]
T10 = _mm_shuffle_epi8(T00, _mm_load_si128((__m128i*)tab_dct_8[0])); // [05 02 06 01 04 03 07 00]
T11 = _mm_shuffle_epi8(T01, _mm_load_si128((__m128i*)tab_dct_8[0]));
@@ -600,12 +600,12 @@
T52 = _mm_srai_epi32(_mm_add_epi32(T42, c_256), 9);
T53 = _mm_srai_epi32(_mm_add_epi32(T43, c_256), 9);
- _mm_store_si128((__m128i*)&pDst[0 * 8 + 0], T50);
- _mm_store_si128((__m128i*)&pDst[0 * 8 + 4], T51);
- _mm_store_si128((__m128i*)&pDst[4 * 8 + 0], T52);
- _mm_store_si128((__m128i*)&pDst[4 * 8 + 4], T53);
-
-#define MAKE_ODD(tab, dst) \
+ _mm_store_si128((__m128i*)&dst[0 * 8 + 0], T50);
+ _mm_store_si128((__m128i*)&dst[0 * 8 + 4], T51);
+ _mm_store_si128((__m128i*)&dst[4 * 8 + 0], T52);
+ _mm_store_si128((__m128i*)&dst[4 * 8 + 4], T53);
+
+#define MAKE_ODD(tab, dstPos) \
T20 = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[(tab)])); \
T21 = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[(tab)])); \
T22 = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[(tab)])); \
@@ -622,8 +622,8 @@
T41 = _mm_hadd_epi32(T32, T33); \
T50 = _mm_srai_epi32(_mm_add_epi32(T40, c_256), 9); \
T51 = _mm_srai_epi32(_mm_add_epi32(T41, c_256), 9); \
- _mm_store_si128((__m128i*)&pDst[(dst) * 8 + 0], T50); \
- _mm_store_si128((__m128i*)&pDst[(dst) * 8 + 4], T51);
+ _mm_store_si128((__m128i*)&dst[(dstPos) * 8 + 0], T50); \
+ _mm_store_si128((__m128i*)&dst[(dstPos) * 8 + 4], T51);
MAKE_ODD(9, 2);
MAKE_ODD(10, 6);
More information about the x265-devel mailing list