[x265] [PATCH] dct: replace dequant vector class function with intrinsic

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Thu Oct 10 13:28:57 CEST 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1381404467 -19800
#      Thu Oct 10 16:57:47 2013 +0530
# Node ID 840229ed3794569f5e15d84289531c829b75dcd6
# Parent  a79ecf3a787577a2e557659c7a8d226d7d41ce00
dct: replace dequant vector class functions with SSE intrinsics

diff -r a79ecf3a7875 -r 840229ed3794 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp	Thu Oct 10 12:29:41 2013 +0530
+++ b/source/common/vec/dct-sse3.cpp	Thu Oct 10 16:57:47 2013 +0530
@@ -62,68 +62,68 @@
         if (shift > per)
         {
             valueToAdd = 1 << (shift - per - 1);
-            Vec4i IAdd(valueToAdd);
+            __m128i IAdd = _mm_set1_epi32(valueToAdd);
 
             for (int n = 0; n < width * height; n = n + 8)
             {
-                Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
+                __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
 
-                quantCoef1.load(quantCoef + n);
-                quantCoef2.load(quantCoef + n + 4);
+                quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+                quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
 
-                deQuantCoef1.load(deQuantCoef + n);
-                deQuantCoef2.load(deQuantCoef + n + 4);
+                deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
+                deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
 
-                Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+                quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+                sign = _mm_srai_epi16(quantCoef12, 15);
+                quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+                quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
 
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef2 = extend_high(quantCoef12);
+                quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, deQuantCoef1), IAdd), _mm_cvtsi32_si128(shift - per));
+                quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, deQuantCoef2), IAdd), _mm_cvtsi32_si128(shift - per));
 
-                quantCoef1 =  (quantCoef1 *  deQuantCoef1 + IAdd) >> (shift - per);
-                quantCoef2 =  (quantCoef2 *  deQuantCoef2 + IAdd) >> (shift - per);
-
-                quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef1.store(coef + n);
-                quantCoef2 = extend_high(quantCoef12);
-                quantCoef2.store(coef + n + 4);
+                quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+                sign = _mm_srai_epi16(quantCoef12, 15);
+                quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+                _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+                quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+                _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
             }
         }
         else
         {
             for (int n = 0; n < width * height; n = n + 8)
             {
-                Vec4i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2;
+                __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
 
-                quantCoef1.load(quantCoef + n);
-                quantCoef2.load(quantCoef + n + 4);
+                quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+                quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
 
-                deQuantCoef1.load(deQuantCoef + n);
-                deQuantCoef2.load(deQuantCoef + n + 4);
+                deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
+                deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
 
-                Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+                quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+                sign = _mm_srai_epi16(quantCoef12, 15);
+                quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+                quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
 
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef2 = extend_high(quantCoef12);
+                quantCoef1 = _mm_mullo_epi32(quantCoef1, deQuantCoef1);
+                quantCoef2 = _mm_mullo_epi32(quantCoef2, deQuantCoef2);
 
-                quantCoef1 = quantCoef1 * deQuantCoef1;
-                quantCoef2 = quantCoef2 * deQuantCoef2;
+                quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+                sign = _mm_srai_epi16(quantCoef12, 15);
+                quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+                quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
 
-                quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+                quantCoef1 = _mm_sll_epi32(quantCoef1, _mm_cvtsi32_si128(per - shift));
+                quantCoef2 = _mm_sll_epi32(quantCoef2, _mm_cvtsi32_si128(per - shift));
 
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef2 = extend_high(quantCoef12);
-
-                quantCoef1 = quantCoef1 << (per - shift);
-                quantCoef2 = quantCoef2 << (per - shift);
-
-                quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-                quantCoef1 = extend_low(quantCoef12);
-                quantCoef1.store(coef + n);
-                quantCoef2 = extend_high(quantCoef12);
-                quantCoef2.store(coef + n + 4);
+                quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+                sign = _mm_srai_epi16(quantCoef12, 15);
+                quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+                _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+                quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+                _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
             }
         }
     }
@@ -132,29 +132,30 @@
         valueToAdd = 1 << (shift - 1);
         int scale = invQuantScales[rem] << per;
 
-        Vec4i vScale(scale);
-        Vec4i vAdd(valueToAdd);
+        __m128i vScale = _mm_set1_epi32(scale);
+        __m128i vAdd = _mm_set1_epi32(valueToAdd);
 
         for (int n = 0; n < width * height; n = n + 8)
         {
-            Vec4i quantCoef1, quantCoef2;
-            quantCoef1.load(quantCoef + n);
-            quantCoef2.load(quantCoef + n + 4);
+            __m128i quantCoef1, quantCoef2, quantCoef12, sign;
 
-            Vec8s quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
+            quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
+            quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
 
-            quantCoef1 = extend_low(quantCoef12);
-            quantCoef2 = extend_high(quantCoef12);
+            quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+            sign = _mm_srai_epi16(quantCoef12, 15);
+            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
 
-            quantCoef1 = (quantCoef1 * vScale + vAdd) >> shift;
-            quantCoef2 = (quantCoef2 * vScale + vAdd) >> shift;
+            quantCoef1 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef1, vScale), vAdd), _mm_cvtsi32_si128(shift));
+            quantCoef2 = _mm_sra_epi32(_mm_add_epi32(_mm_mullo_epi32(quantCoef2, vScale), vAdd), _mm_cvtsi32_si128(shift));
 
-            quantCoef12 = compress_saturated(quantCoef1, quantCoef2);
-
-            quantCoef1 = extend_low(quantCoef12);
-            quantCoef1.store(coef + n);
-            quantCoef2 = extend_high(quantCoef12);
-            quantCoef2.store(coef + n + 4);
+            quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
+            sign = _mm_srai_epi16(quantCoef12, 15);
+            quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
+            _mm_storeu_si128((__m128i*)(coef + n), quantCoef1);
+            quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
+            _mm_storeu_si128((__m128i*)(coef + n + 4), quantCoef2);
         }
     }
 }


More information about the x265-devel mailing list