[x265] [PATCH] dequant_scaling optimization, downscaling quantCoef from int32_t* to int16_t*
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 2 16:09:16 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409638087 -19800
# Node ID babeba30ab42c98b5ec124dc5e04567e9a4aa775
# Parent 71b094ee56aaa0adb6e25789d289844c0820f062
dequant_scaling optimization, downscaling quantCoef from int32_t* to int16_t*
diff -r 71b094ee56aa -r babeba30ab42 source/common/dct.cpp
--- a/source/common/dct.cpp Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/dct.cpp Tue Sep 02 11:38:07 2014 +0530
@@ -741,12 +741,11 @@
}
}
-void dequant_scaling_c(const int32_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling_c(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
int add, coeffQ;
- int clipQCoef;
shift += 4;
@@ -756,8 +755,7 @@
for (int n = 0; n < num; n++)
{
- clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
- coeffQ = ((clipQCoef * deQuantCoef[n]) + add) >> (shift - per);
+ coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
coef[n] = Clip3(-32768, 32767, coeffQ);
}
}
@@ -765,8 +763,7 @@
{
for (int n = 0; n < num; n++)
{
- clipQCoef = Clip3(-32768, 32767, quantCoef[n]);
- coeffQ = Clip3(-32768, 32767, clipQCoef * deQuantCoef[n]);
+ coeffQ = Clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
coef[n] = Clip3(-32768, 32767, coeffQ << (per - shift));
}
}
diff -r 71b094ee56aa -r babeba30ab42 source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/primitives.h Tue Sep 02 11:38:07 2014 +0530
@@ -164,7 +164,7 @@
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
typedef uint32_t (*nquant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
-typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
+typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
typedef void (*dequant_normal_t)(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
typedef int (*count_nonzero_t)(const int16_t *quantCoeff, int numCoeff);
diff -r 71b094ee56aa -r babeba30ab42 source/common/quant.cpp
--- a/source/common/quant.cpp Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/quant.cpp Tue Sep 02 11:38:07 2014 +0530
@@ -423,7 +423,7 @@
{
int scalingListType = (bIntra ? 0 : 3) + ttype;
int32_t *dequantCoef = m_scalingList->m_dequantCoef[log2TrSize - 2][scalingListType][rem];
- primitives.dequant_scaling(coeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
+ primitives.dequant_scaling(qCoeff, dequantCoef, m_resiDctCoeff, numCoeff, per, shift);
}
else
{
diff -r 71b094ee56aa -r babeba30ab42 source/common/vec/dct-sse41.cpp
--- a/source/common/vec/dct-sse41.cpp Tue Sep 02 11:23:11 2014 +0530
+++ b/source/common/vec/dct-sse41.cpp Tue Sep 02 11:38:07 2014 +0530
@@ -36,7 +36,7 @@
using namespace x265;
namespace {
-void dequant_scaling(const int32_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
+void dequant_scaling(const int16_t* quantCoef, const int32_t *deQuantCoef, int32_t* coef, int num, int per, int shift)
{
X265_CHECK(num <= 32 * 32, "dequant num too large\n");
@@ -53,13 +53,11 @@
{
__m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
- quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
- quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
+ quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n));
deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
- quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
sign = _mm_srai_epi16(quantCoef12, 15);
quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
@@ -81,13 +79,11 @@
{
__m128i quantCoef1, quantCoef2, deQuantCoef1, deQuantCoef2, quantCoef12, sign;
- quantCoef1 = _mm_loadu_si128((__m128i*)(quantCoef + n));
- quantCoef2 = _mm_loadu_si128((__m128i*)(quantCoef + n + 4));
+ quantCoef12 = _mm_loadu_si128((__m128i*)(quantCoef + n));
deQuantCoef1 = _mm_loadu_si128((__m128i*)(deQuantCoef + n));
deQuantCoef2 = _mm_loadu_si128((__m128i*)(deQuantCoef + n + 4));
- quantCoef12 = _mm_packs_epi32(quantCoef1, quantCoef2);
sign = _mm_srai_epi16(quantCoef12, 15);
quantCoef1 = _mm_unpacklo_epi16(quantCoef12, sign);
quantCoef2 = _mm_unpackhi_epi16(quantCoef12, sign);
diff -r 71b094ee56aa -r babeba30ab42 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Tue Sep 02 11:23:11 2014 +0530
+++ b/source/test/mbdstharness.cpp Tue Sep 02 11:38:07 2014 +0530
@@ -185,12 +185,11 @@
int cmp_size = sizeof(int) * height * width;
int index1 = rand() % TEST_CASES;
- int index2 = rand() % TEST_CASES;
- ref(int_test_buff[index1] + j, mintbuf3, int_test_buff[index2] + j, width * height, per, shift);
- checked(opt, int_test_buff[index1] + j, mintbuf4, int_test_buff[index2] + j, width * height, per, shift);
+ ref(short_test_buff[index1] + j, mintbuf3, mintbuf1, width * height, per, shift);
+ checked(opt, short_test_buff[index1] + j, mintbuf4, mintbuf2, width * height, per, shift);
- if (memcmp(mintbuf3, mintbuf4, cmp_size))
+ if (memcmp(mintbuf1, mintbuf2, cmp_size))
return false;
reportfail();
@@ -384,6 +383,15 @@
}
}
+ if (opt.dequant_scaling)
+ {
+ if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling))
+ {
+ printf("dequant_scaling: Failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -416,7 +424,7 @@
if (opt.dequant_scaling)
{
printf("dequant_scaling\t");
- REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, int_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
+ REPORT_SPEEDUP(opt.dequant_scaling, ref.dequant_scaling, short_test_buff[0], mintbuf3, mintbuf4, 32 * 32, 5, 1);
}
if (opt.quant)
More information about the x265-devel mailing list