[x265] [PATCH] xIDCT32 code cleanup

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Jul 5 11:50:54 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1373017843 -19800
# Node ID e3e4ed95a58af4e69a7494b2ecddcec0e07b3513
# Parent  21934e9f52130a7dec6ea3f7a96ad99d023d61c3
xIDCT32 code cleanup: rename pSrc/pDst to src/dst and replace Int with int

diff -r 21934e9f5213 -r e3e4ed95a58a source/common/vec/dct.inc
--- a/source/common/vec/dct.inc	Fri Jul 05 15:14:04 2013 +0530
+++ b/source/common/vec/dct.inc	Fri Jul 05 15:20:43 2013 +0530
@@ -3100,7 +3100,7 @@
     _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
 }
 
-void xIDCT32(int *pSrc, short *pDst, intptr_t stride)
+void xIDCT32(int *src, short *dst, intptr_t stride)
 {
     //Odd
     const __m128i c16_p90_p90   = _mm_set1_epi32(0x005A005A); //column 0
@@ -3282,7 +3282,7 @@
     const __m128i c16_p64_p64   = _mm_set1_epi32(0x00400040);
     __m128i c32_rnd             = _mm_set1_epi32(64);
 
-    Int nShift = 7;
+    int nShift = 7;
 
     // DCT1
     __m128i in00[4], in01[4], in02[4], in03[4], in04[4], in05[4], in06[4], in07[4], in08[4], in09[4], in10[4], in11[4], in12[4], in13[4], in14[4], in15[4];
@@ -3290,141 +3290,141 @@
     __m128i res00[4], res01[4], res02[4], res03[4], res04[4], res05[4], res06[4], res07[4], res08[4], res09[4], res10[4], res11[4], res12[4], res13[4], res14[4], res15[4];
     __m128i res16[4], res17[4], res18[4], res19[4], res20[4], res21[4], res22[4], res23[4], res24[4], res25[4], res26[4], res27[4], res28[4], res29[4], res30[4], res31[4];
 
-    for (Int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
     {
         const int offset = (i << 3);
         __m128i T00, T01;
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 32 + offset + 4]);
         in00[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 32 + offset + 4]);
         in01[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 32 + offset + 4]);
         in02[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 32 + offset + 4]);
         in03[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 32 + offset + 4]);
         in04[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 32 + offset + 4]);
         in05[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 32 + offset + 4]);
         in06[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 32 + offset + 4]);
         in07[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 32 + offset + 4]);
         in08[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 32 + offset + 4]);
         in09[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 32 + offset + 4]);
         in10[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 32 + offset + 4]);
         in11[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 32 + offset + 4]);
         in12[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 32 + offset + 4]);
         in13[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 32 + offset + 4]);
         in14[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 32 + offset + 4]);
         in15[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[16 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[16 * 32 + offset + 4]);
         in16[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[17 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[17 * 32 + offset + 4]);
         in17[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[18 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[18 * 32 + offset + 4]);
         in18[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[19 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[19 * 32 + offset + 4]);
         in19[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[20 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[20 * 32 + offset + 4]);
         in20[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[21 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[21 * 32 + offset + 4]);
         in21[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[22 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[22 * 32 + offset + 4]);
         in22[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[23 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[23 * 32 + offset + 4]);
         in23[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[24 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[24 * 32 + offset + 4]);
         in24[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[25 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[25 * 32 + offset + 4]);
         in25[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[26 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[26 * 32 + offset + 4]);
         in26[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[27 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[27 * 32 + offset + 4]);
         in27[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[28 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[28 * 32 + offset + 4]);
         in28[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[29 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[29 * 32 + offset + 4]);
         in29[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[30 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[30 * 32 + offset + 4]);
         in30[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[31 * 32 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[31 * 32 + offset + 4]);
         in31[i]  = _mm_packs_epi32(T00, T01);
     }
 
-    for (Int pass = 0; pass < 2; pass++)
+    for (int pass = 0; pass < 2; pass++)
     {
         if (pass == 1)
         {
@@ -3432,7 +3432,7 @@
             nShift  = 12;
         }
 
-        for (Int part = 0; part < 4; part++)
+        for (int part = 0; part < 4; part++)
         {
             const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);       // [33 13 32 12 31 11 30 10]
             const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);       // [37 17 36 16 35 15 34 14]
@@ -3909,25 +3909,25 @@
     }
 
     // Add
-    for (Int i = 0; i < 2; i++)
+    for (int i = 0; i < 2; i++)
     {
 #define STROE_LINE(L0, L1, L2, L3, L4, L5, L6, L7, H0, H1, H2, H3, H4, H5, H6, H7, offsetV, offsetH) \
-    _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
-    _mm_storeu_si128((__m128i*)&pDst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
-    _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
-    _mm_storeu_si128((__m128i*)&pDst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
-    _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
-    _mm_storeu_si128((__m128i*)&pDst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
-    _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
-    _mm_storeu_si128((__m128i*)&pDst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
-    _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
-    _mm_storeu_si128((__m128i*)&pDst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
-    _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
-    _mm_storeu_si128((__m128i*)&pDst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
-    _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
-    _mm_storeu_si128((__m128i*)&pDst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
-    _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
-    _mm_storeu_si128((__m128i*)&pDst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
+    _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 0], L0); \
+    _mm_storeu_si128((__m128i*)&dst[(0 + (offsetV)) * stride + (offsetH) + 8], H0); \
+    _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 0], L1); \
+    _mm_storeu_si128((__m128i*)&dst[(1 + (offsetV)) * stride + (offsetH) + 8], H1); \
+    _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 0], L2); \
+    _mm_storeu_si128((__m128i*)&dst[(2 + (offsetV)) * stride + (offsetH) + 8], H2); \
+    _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 0], L3); \
+    _mm_storeu_si128((__m128i*)&dst[(3 + (offsetV)) * stride + (offsetH) + 8], H3); \
+    _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 0], L4); \
+    _mm_storeu_si128((__m128i*)&dst[(4 + (offsetV)) * stride + (offsetH) + 8], H4); \
+    _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 0], L5); \
+    _mm_storeu_si128((__m128i*)&dst[(5 + (offsetV)) * stride + (offsetH) + 8], H5); \
+    _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 0], L6); \
+    _mm_storeu_si128((__m128i*)&dst[(6 + (offsetV)) * stride + (offsetH) + 8], H6); \
+    _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 0], L7); \
+    _mm_storeu_si128((__m128i*)&dst[(7 + (offsetV)) * stride + (offsetH) + 8], H7);
 
         const int k = i * 2;
         STROE_LINE(in00[k], in01[k], in02[k], in03[k], in04[k], in05[k], in06[k], in07[k], in00[k + 1], in01[k + 1], in02[k + 1], in03[k + 1], in04[k + 1], in05[k + 1], in06[k + 1], in07[k + 1], 0, i * 16)


More information about the x265-devel mailing list