[x265] [PATCH] xIDCT16 code cleanup

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Jul 5 11:44:14 CEST 2013


# HG changeset patch
# User praveentiwari
# Date 1373017444 -19800
# Node ID 21934e9f52130a7dec6ea3f7a96ad99d023d61c3
# Parent  bf00c76a537d61ff3c4460ed750665d9898989c6
xIDCT16 code cleanup: rename pSrc/pDst to src/dst and replace Int with int

diff -r bf00c76a537d -r 21934e9f5213 source/common/vec/dct.inc
--- a/source/common/vec/dct.inc	Thu Jul 04 14:30:37 2013 +0530
+++ b/source/common/vec/dct.inc	Fri Jul 05 15:14:04 2013 +0530
@@ -2689,7 +2689,7 @@
     _mm_storeh_pi((__m64*)&dst[7 * stride +  4], _mm_castsi128_ps(T11));
 }
 
-void xIDCT16(int *pSrc, short *pDst, intptr_t stride)
+void xIDCT16(int *src, short *dst, intptr_t stride)
 {
     const __m128i c16_p87_p90   = _mm_set1_epi32(0x0057005A); //row0 87high - 90low address
     const __m128i c16_p70_p80   = _mm_set1_epi32(0x00460050);
@@ -2753,72 +2753,72 @@
         const int offset = (i << 3);
         __m128i T00, T01;
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[0 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[0 * 16 + offset + 4]);
         in00[i]  = _mm_packs_epi32(T00, T01);                       // [07 06 05 04 03 02 01 00]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[1 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[1 * 16 + offset + 4]);
         in01[i]  = _mm_packs_epi32(T00, T01);                           // [17 16 15 14 13 12 11 10]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[2 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[2 * 16 + offset + 4]);
         in02[i]  = _mm_packs_epi32(T00, T01);                       // [27 26 25 24 23 22 21 20]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[3 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[3 * 16 + offset + 4]);
         in03[i]  = _mm_packs_epi32(T00, T01);                       // [37 36 35 34 33 32 31 30]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[4 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[4 * 16 + offset + 4]);
         in04[i]  = _mm_packs_epi32(T00, T01);                       // [47 46 45 44 43 42 41 40]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[5 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[5 * 16 + offset + 4]);
         in05[i]  = _mm_packs_epi32(T00, T01);                       // [57 56 55 54 53 52 51 50]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[6 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[6 * 16 + offset + 4]);
         in06[i]  = _mm_packs_epi32(T00, T01);                       // [67 66 65 64 63 62 61 60]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[7 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[7 * 16 + offset + 4]);
         in07[i]  = _mm_packs_epi32(T00, T01);                       // [77 76 75 74 73 72 71 70]
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[8 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[8 * 16 + offset + 4]);
         in08[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[9 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[9 * 16 + offset + 4]);
         in09[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[10 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[10 * 16 + offset + 4]);
         in10[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[11 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[11 * 16 + offset + 4]);
         in11[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[12 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[12 * 16 + offset + 4]);
         in12[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[13 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[13 * 16 + offset + 4]);
         in13[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[14 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[14 * 16 + offset + 4]);
         in14[i]  = _mm_packs_epi32(T00, T01);
 
-        T00 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 16 + offset]);
-        T01 = _mm_loadu_si128((const __m128i*)&pSrc[15 * 16 + offset + 4]);
+        T00 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset]);
+        T01 = _mm_loadu_si128((const __m128i*)&src[15 * 16 + offset + 4]);
         in15[i]  = _mm_packs_epi32(T00, T01);
     }
 
-    for (Int pass = 0; pass < 2; pass++)
+    for (int pass = 0; pass < 2; pass++)
     {
         if (pass == 1)
         {
@@ -2826,7 +2826,7 @@
             nShift  = 12;
         }
 
-        for (Int part = 0; part < 2; part++)
+        for (int part = 0; part < 2; part++)
         {
             const __m128i T_00_00A = _mm_unpacklo_epi16(in01[part], in03[part]);       // [33 13 32 12 31 11 30 10]
             const __m128i T_00_00B = _mm_unpackhi_epi16(in01[part], in03[part]);       // [37 17 36 16 35 15 34 14]
@@ -3066,38 +3066,38 @@
         }
     }
 
-    _mm_store_si128((__m128i*)&pDst[0 * stride + 0], in00[0]);
-    _mm_store_si128((__m128i*)&pDst[0 * stride + 8], in00[1]);
-    _mm_store_si128((__m128i*)&pDst[1 * stride + 0], in01[0]);
-    _mm_store_si128((__m128i*)&pDst[1 * stride + 8], in01[1]);
-    _mm_store_si128((__m128i*)&pDst[2 * stride + 0], in02[0]);
-    _mm_store_si128((__m128i*)&pDst[2 * stride + 8], in02[1]);
-    _mm_store_si128((__m128i*)&pDst[3 * stride + 0], in03[0]);
-    _mm_store_si128((__m128i*)&pDst[3 * stride + 8], in03[1]);
-    _mm_store_si128((__m128i*)&pDst[4 * stride + 0], in04[0]);
-    _mm_store_si128((__m128i*)&pDst[4 * stride + 8], in04[1]);
-    _mm_store_si128((__m128i*)&pDst[5 * stride + 0], in05[0]);
-    _mm_store_si128((__m128i*)&pDst[5 * stride + 8], in05[1]);
-    _mm_store_si128((__m128i*)&pDst[6 * stride + 0], in06[0]);
-    _mm_store_si128((__m128i*)&pDst[6 * stride + 8], in06[1]);
-    _mm_store_si128((__m128i*)&pDst[7 * stride + 0], in07[0]);
-    _mm_store_si128((__m128i*)&pDst[7 * stride + 8], in07[1]);
-    _mm_store_si128((__m128i*)&pDst[8 * stride + 0], in08[0]);
-    _mm_store_si128((__m128i*)&pDst[8 * stride + 8], in08[1]);
-    _mm_store_si128((__m128i*)&pDst[9 * stride + 0], in09[0]);
-    _mm_store_si128((__m128i*)&pDst[9 * stride + 8], in09[1]);
-    _mm_store_si128((__m128i*)&pDst[10 * stride + 0], in10[0]);
-    _mm_store_si128((__m128i*)&pDst[10 * stride + 8], in10[1]);
-    _mm_store_si128((__m128i*)&pDst[11 * stride + 0], in11[0]);
-    _mm_store_si128((__m128i*)&pDst[11 * stride + 8], in11[1]);
-    _mm_store_si128((__m128i*)&pDst[12 * stride + 0], in12[0]);
-    _mm_store_si128((__m128i*)&pDst[12 * stride + 8], in12[1]);
-    _mm_store_si128((__m128i*)&pDst[13 * stride + 0], in13[0]);
-    _mm_store_si128((__m128i*)&pDst[13 * stride + 8], in13[1]);
-    _mm_store_si128((__m128i*)&pDst[14 * stride + 0], in14[0]);
-    _mm_store_si128((__m128i*)&pDst[14 * stride + 8], in14[1]);
-    _mm_store_si128((__m128i*)&pDst[15 * stride + 0], in15[0]);
-    _mm_store_si128((__m128i*)&pDst[15 * stride + 8], in15[1]);
+    _mm_store_si128((__m128i*)&dst[0 * stride + 0], in00[0]);
+    _mm_store_si128((__m128i*)&dst[0 * stride + 8], in00[1]);
+    _mm_store_si128((__m128i*)&dst[1 * stride + 0], in01[0]);
+    _mm_store_si128((__m128i*)&dst[1 * stride + 8], in01[1]);
+    _mm_store_si128((__m128i*)&dst[2 * stride + 0], in02[0]);
+    _mm_store_si128((__m128i*)&dst[2 * stride + 8], in02[1]);
+    _mm_store_si128((__m128i*)&dst[3 * stride + 0], in03[0]);
+    _mm_store_si128((__m128i*)&dst[3 * stride + 8], in03[1]);
+    _mm_store_si128((__m128i*)&dst[4 * stride + 0], in04[0]);
+    _mm_store_si128((__m128i*)&dst[4 * stride + 8], in04[1]);
+    _mm_store_si128((__m128i*)&dst[5 * stride + 0], in05[0]);
+    _mm_store_si128((__m128i*)&dst[5 * stride + 8], in05[1]);
+    _mm_store_si128((__m128i*)&dst[6 * stride + 0], in06[0]);
+    _mm_store_si128((__m128i*)&dst[6 * stride + 8], in06[1]);
+    _mm_store_si128((__m128i*)&dst[7 * stride + 0], in07[0]);
+    _mm_store_si128((__m128i*)&dst[7 * stride + 8], in07[1]);
+    _mm_store_si128((__m128i*)&dst[8 * stride + 0], in08[0]);
+    _mm_store_si128((__m128i*)&dst[8 * stride + 8], in08[1]);
+    _mm_store_si128((__m128i*)&dst[9 * stride + 0], in09[0]);
+    _mm_store_si128((__m128i*)&dst[9 * stride + 8], in09[1]);
+    _mm_store_si128((__m128i*)&dst[10 * stride + 0], in10[0]);
+    _mm_store_si128((__m128i*)&dst[10 * stride + 8], in10[1]);
+    _mm_store_si128((__m128i*)&dst[11 * stride + 0], in11[0]);
+    _mm_store_si128((__m128i*)&dst[11 * stride + 8], in11[1]);
+    _mm_store_si128((__m128i*)&dst[12 * stride + 0], in12[0]);
+    _mm_store_si128((__m128i*)&dst[12 * stride + 8], in12[1]);
+    _mm_store_si128((__m128i*)&dst[13 * stride + 0], in13[0]);
+    _mm_store_si128((__m128i*)&dst[13 * stride + 8], in13[1]);
+    _mm_store_si128((__m128i*)&dst[14 * stride + 0], in14[0]);
+    _mm_store_si128((__m128i*)&dst[14 * stride + 8], in14[1]);
+    _mm_store_si128((__m128i*)&dst[15 * stride + 0], in15[0]);
+    _mm_store_si128((__m128i*)&dst[15 * stride + 8], in15[1]);
 }
 
 void xIDCT32(int *pSrc, short *pDst, intptr_t stride)


More information about the x265-devel mailing list