[x265] [PATCH] xDCT16 intrinsic code cleanup

praveen at multicorewareinc.com praveen at multicorewareinc.com
Wed Jul 3 08:12:25 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1372831931 -19800
# Node ID eda2a00987552d3e3bbd835cc894717197dcecd7
# Parent  e59400fe1240306767820268ab342031bbbdcf6a
xDCT16 intrinsic code cleanup: rename Hungarian-prefixed parameters (pSrc -> src, pDst -> dst) and rename the MAKE_ODD macro argument (dst -> dstPos) so it no longer shadows the new dst parameter

diff -r e59400fe1240 -r eda2a0098755 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc	Wed Jul 03 10:50:23 2013 +0530
+++ b/source/common/vec/dct.inc	Wed Jul 03 11:42:11 2013 +0530
@@ -855,7 +855,7 @@
 #undef MAKE_COEF
 };
 
-void xDCT16(short *pSrc, int *pDst, intptr_t nStride)
+void xDCT16(short *src, int *dst, intptr_t nStride)
 {
     // Const
     __m128i c_4     = _mm_set1_epi32(4);
@@ -878,22 +878,22 @@
     // DCT1
     for (i = 0; i < 16; i += 8)
     {
-        T00A = _mm_load_si128((__m128i*)&pSrc[(i + 0) * nStride + 0]);    // [07 06 05 04 03 02 01 00]
-        T00B = _mm_load_si128((__m128i*)&pSrc[(i + 0) * nStride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
-        T01A = _mm_load_si128((__m128i*)&pSrc[(i + 1) * nStride + 0]);    // [17 16 15 14 13 12 11 10]
-        T01B = _mm_load_si128((__m128i*)&pSrc[(i + 1) * nStride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
-        T02A = _mm_load_si128((__m128i*)&pSrc[(i + 2) * nStride + 0]);    // [27 26 25 24 23 22 21 20]
-        T02B = _mm_load_si128((__m128i*)&pSrc[(i + 2) * nStride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
-        T03A = _mm_load_si128((__m128i*)&pSrc[(i + 3) * nStride + 0]);    // [37 36 35 34 33 32 31 30]
-        T03B = _mm_load_si128((__m128i*)&pSrc[(i + 3) * nStride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
-        T04A = _mm_load_si128((__m128i*)&pSrc[(i + 4) * nStride + 0]);    // [47 46 45 44 43 42 41 40]
-        T04B = _mm_load_si128((__m128i*)&pSrc[(i + 4) * nStride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
-        T05A = _mm_load_si128((__m128i*)&pSrc[(i + 5) * nStride + 0]);    // [57 56 55 54 53 52 51 50]
-        T05B = _mm_load_si128((__m128i*)&pSrc[(i + 5) * nStride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
-        T06A = _mm_load_si128((__m128i*)&pSrc[(i + 6) * nStride + 0]);    // [67 66 65 64 63 62 61 60]
-        T06B = _mm_load_si128((__m128i*)&pSrc[(i + 6) * nStride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
-        T07A = _mm_load_si128((__m128i*)&pSrc[(i + 7) * nStride + 0]);    // [77 76 75 74 73 72 71 70]
-        T07B = _mm_load_si128((__m128i*)&pSrc[(i + 7) * nStride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]
+        T00A = _mm_load_si128((__m128i*)&src[(i + 0) * nStride + 0]);    // [07 06 05 04 03 02 01 00]
+        T00B = _mm_load_si128((__m128i*)&src[(i + 0) * nStride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
+        T01A = _mm_load_si128((__m128i*)&src[(i + 1) * nStride + 0]);    // [17 16 15 14 13 12 11 10]
+        T01B = _mm_load_si128((__m128i*)&src[(i + 1) * nStride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
+        T02A = _mm_load_si128((__m128i*)&src[(i + 2) * nStride + 0]);    // [27 26 25 24 23 22 21 20]
+        T02B = _mm_load_si128((__m128i*)&src[(i + 2) * nStride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
+        T03A = _mm_load_si128((__m128i*)&src[(i + 3) * nStride + 0]);    // [37 36 35 34 33 32 31 30]
+        T03B = _mm_load_si128((__m128i*)&src[(i + 3) * nStride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
+        T04A = _mm_load_si128((__m128i*)&src[(i + 4) * nStride + 0]);    // [47 46 45 44 43 42 41 40]
+        T04B = _mm_load_si128((__m128i*)&src[(i + 4) * nStride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
+        T05A = _mm_load_si128((__m128i*)&src[(i + 5) * nStride + 0]);    // [57 56 55 54 53 52 51 50]
+        T05B = _mm_load_si128((__m128i*)&src[(i + 5) * nStride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
+        T06A = _mm_load_si128((__m128i*)&src[(i + 6) * nStride + 0]);    // [67 66 65 64 63 62 61 60]
+        T06B = _mm_load_si128((__m128i*)&src[(i + 6) * nStride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
+        T07A = _mm_load_si128((__m128i*)&src[(i + 7) * nStride + 0]);    // [77 76 75 74 73 72 71 70]
+        T07B = _mm_load_si128((__m128i*)&src[(i + 7) * nStride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]
 
         T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
         T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
@@ -1017,7 +1017,7 @@
         T70  = _mm_packs_epi32(T60, T61);
         _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
 
-#define MAKE_ODD(tab, dst) \
+#define MAKE_ODD(tab, dstPos) \
     T60  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
     T61  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
     T62  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
@@ -1035,7 +1035,7 @@
     T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), 3); \
     T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), 3); \
     T70  = _mm_packs_epi32(T60, T61); \
-    _mm_store_si128((__m128i*)&tmp[(dst) * 16 + i], T70);
+    _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
 
         MAKE_ODD(0, 1);
         MAKE_ODD(1, 3);
@@ -1099,8 +1099,8 @@
         T41  = _mm_hsub_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
         T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[0 * 16 + i], T40);
-        _mm_storeu_si128((__m128i*)&pDst[8 * 16 + i], T41);
+        _mm_storeu_si128((__m128i*)&dst[0 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[8 * 16 + i], T41);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
@@ -1121,7 +1121,7 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[4 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[4 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
@@ -1142,7 +1142,7 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[12 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[12 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
@@ -1163,7 +1163,7 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[2 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[2 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
@@ -1184,7 +1184,7 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[6 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[6 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
@@ -1205,7 +1205,7 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[10 * 16 + i], T40);
+        _mm_storeu_si128((__m128i*)&dst[10 * 16 + i], T40);
 
         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
@@ -1226,9 +1226,9 @@
 
         T40  = _mm_hadd_epi32(T30, T31);
         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10);
-        _mm_storeu_si128((__m128i*)&pDst[14 * 16 + i], T40);
-
-#define MAKE_ODD(tab, dst) \
+        _mm_storeu_si128((__m128i*)&dst[14 * 16 + i], T40);
+
+#define MAKE_ODD(tab, dstPos) \
     T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
     T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
     T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
@@ -1248,7 +1248,7 @@
         \
     T40  = _mm_hadd_epi32(T30, T31); \
     T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), 10); \
-    _mm_storeu_si128((__m128i*)&pDst[(dst) * 16 + i], T40);
+    _mm_storeu_si128((__m128i*)&dst[(dstPos) * 16 + i], T40);
 
         MAKE_ODD(14,  1);
         MAKE_ODD(16,  3);


More information about the x265-devel mailing list