[x265] [PATCH] dequant_scaling optimization, downscaling quantCoef from int32_t* to int16_t*

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Sep 2 16:09:16 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1409638087 -19800
# Node ID babeba30ab42c98b5ec124dc5e04567e9a4aa775
# Parent  71b094ee56aaa0adb6e25789d289844c0820f062
dequant_scaling optimization: narrow quantCoef from int32_t* to int16_t*, dropping the now-redundant 16-bit clip (C) and pack (SSE4.1) of the input

diff -r 71b094ee56aa -r babeba30ab42 source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/dct.cpp	Tue Sep 02 11:38:07 2014 +0530
@@ -741,12 +741,11 @@
     }
 }
 
-void dequant_scaling_c(const int32_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
 {
     X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
 
     int add, coeffQ;
-    int clipQCoef;
 
     shift += 4;
 
@@ -756,8 +755,7 @@
 
         for (int n = 0; n < num; n++)
         {
-            clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
-            coeffQ = ((clipQCoef * deQuantCoef[n]) + add) >> (shift - per);
+            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
             coef[n] = Clip3(-32768, 32767, coeffQ);
         }
     }
@@ -765,8 +763,7 @@
     {
         for (int n = 0; n < num; n++)
         {
-            clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
-            coeffQ   = Clip3(-32768, 32767, clipQCoef * deQuantCoef[n]);
+            coeffQ   = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
             coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
         }
     }
diff -r 71b094ee56aa -r babeba30ab42 source/common/primitives.h
--- a/source/common/primitives.h	Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/primitives.h	Tue Sep 02 11:38:07 2014 +0530
@@ -164,7 +164,7 @@
 typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
 typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
 typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 typedef int  (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
 
diff -r 71b094ee56aa -r babeba30ab42 source/common/quant.cpp
--- a/source/common/quant.cpp	Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/quant.cpp	Tue Sep 02 11:38:07 2014 +0530
@@ -423,7 +423,7 @@
     {
         int scalingListType = (bIntra ? 0 : 3) + ttype;
         int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
-        primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
+        primitives.dequant_scaling(qCoeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
     }
     else
     {
diff -r 71b094ee56aa -r babeba30ab42 source/common/vec/dct-sse41.cpp
--- a/source/common/vec/dct-sse41.cpp	Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/vec/dct-sse41.cpp	Tue Sep 02 11:38:07 2014 +0530
@@ -36,7 +36,7 @@
 using namespace x265;
 
 namespace {
-void dequant_scaling(const int32_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
 {
     X265_CHECK(num <= 32 * 32, "dequant num too large\n");
 
@@ -53,13 +53,11 @@
         {
             __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
 
-            quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
-            quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
+            quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n));
 
             deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
             deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
 
-            quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
             sign = _mm_srai_epi16(quantCoef12, 15);
             quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
             quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
@@ -81,13 +79,11 @@
         {
             __m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
 
-            quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
-            quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
+            quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n));
 
             deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
             deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
 
-            quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
             sign = _mm_srai_epi16(quantCoef12, 15);
             quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
             quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
diff -r 71b094ee56aa -r babeba30ab42 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Tue Sep 02 11:23:11 2014 +0530
+++ b/source/test/mbdstharness.cpp	Tue Sep 02 11:38:07 2014 +0530
@@ -185,12 +185,11 @@
 
         int cmp_size = sizeof(int) * height * width;
         int index1 = rand() % TEST_CASES;
-        int index2 = rand() % TEST_CASES;
 
-        ref(int_test_buff[index1] + j, mintbuf3, int_test_buff[index2] + j, width * height, per, shift);
-        checked(opt, int_test_buff[index1] + j, mintbuf4, int_test_buff[index2] + j, width * height, per, shift);
+        ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
+        checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
 
-        if (memcmp(mintbuf3, mintbuf4, cmp_size))
+        if (memcmp(mintbuf1, mintbuf2, cmp_size))
             return false;
 
         reportfail();
@@ -384,6 +383,15 @@
         }
     }
 
+    if (opt.dequant_scaling)
+    {
+        if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling))
+        {
+            printf("dequant_scaling: Failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -416,7 +424,7 @@
     if (opt.dequant_scaling)
     {
         printf("dequant_scaling\t");
-        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, int_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
+        REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
     }
 
     if (opt.quant)


More information about the x265-devel mailing list