[x265] [PATCH] conv16to32_count C interface modification, downscaling coeff from int32_t* to int16_t*
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 2 16:11:54 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409646842 -19800
# Node ID a70b4e57aac2c535add2de15145c2a86638116f4
# Parent 51b5a6d820da97a4178dc42d2ef98ffe1970511b
conv16to32_count C interface modification, downscaling coeff from int32_t* to int16_t*
diff -r 51b5a6d820da -r a70b4e57aac2 source/common/dct.cpp
--- a/source/common/dct.cpp Mon Sep 01 17:07:05 2014 +0530
+++ b/source/common/dct.cpp Tue Sep 02 14:04:02 2014 +0530
@@ -827,7 +827,7 @@
}
template<int trSize>
-uint32_t conv16to32_count(coeff_t* coeff, int16_t* residual, intptr_t stride)
+uint32_t conv16to32_count(int16_t* coeff, int16_t* residual, intptr_t stride)
{
uint32_t numSig = 0;
for (int k = 0; k < trSize; k++)
diff -r 51b5a6d820da -r a70b4e57aac2 source/common/primitives.h
--- a/source/common/primitives.h Mon Sep 01 17:07:05 2014 +0530
+++ b/source/common/primitives.h Tue Sep 02 14:04:02 2014 +0530
@@ -153,7 +153,7 @@
typedef void (*cvt16to32_shr_t)(int32_t *dst, int16_t *src, intptr_t, int, int);
typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
-typedef uint32_t (*cvt16to32_cnt_t)(coeff_t* coeff, int16_t* residual, intptr_t stride);
+typedef uint32_t (*cvt16to32_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
diff -r 51b5a6d820da -r a70b4e57aac2 source/common/quant.cpp
--- a/source/common/quant.cpp Mon Sep 01 17:07:05 2014 +0530
+++ b/source/common/quant.cpp Tue Sep 02 14:04:02 2014 +0530
@@ -315,7 +315,23 @@
if (cu->getCUTransquantBypass(absPartIdx))
{
X265_CHECK(log2TrSize >= 2 && log2TrSize <= 5, "Block size mistake!\n");
- return primitives.cvt16to32_cnt[log2TrSize - 2](coeff, residual, stride);
+ /* This section of code safely converts the int32_t coefficients to int16_t. Once the caller function is
+ * optimized to take coefficients as int16_t*, it will be removed. */
+ int numCoeff = 1 << log2TrSize * 2;
+ ALIGN_VAR_16(int16_t, qCoeff[32 * 32]);
+ for (int i = 0; i < numCoeff; i++)
+ {
+ qCoeff[i] = (int16_t)Clip3(-32768, 32767, coeff[i]);
+ }
+ int numSign = primitives.cvt16to32_cnt[log2TrSize - 2](qCoeff, residual, stride);
+
+ /* This section of code converts the int16_t coefficients back to int32_t. Once the caller function is
+ * optimized to take coefficients as int16_t*, it will be removed. */
+ for (int i = 0; i < numCoeff; i++)
+ {
+ coeff[i] = qCoeff[i];
+ }
+ return numSign;
}
bool isLuma = ttype == TEXT_LUMA;
diff -r 51b5a6d820da -r a70b4e57aac2 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Mon Sep 01 17:07:05 2014 +0530
+++ b/source/common/x86/blockcopy8.h Tue Sep 02 14:04:02 2014 +0530
@@ -38,14 +38,14 @@
void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-uint32_t x265_cvt16to32_cnt_4_sse4(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_8_sse4(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_32_sse4(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_4_avx2(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_8_avx2(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_16_avx2(int32_t * dst, int16_t * src, intptr_t);
-uint32_t x265_cvt16to32_cnt_32_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_32_sse4(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_4_avx2(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_8_avx2(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_16_avx2(int16_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_32_avx2(int16_t * dst, int16_t * src, intptr_t);
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
diff -r 51b5a6d820da -r a70b4e57aac2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Sep 01 17:07:05 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Sep 02 14:04:02 2014 +0530
@@ -582,8 +582,8 @@
bool PixelHarness::check_cvt16to32_cnt_t(cvt16to32_cnt_t ref, cvt16to32_cnt_t opt)
{
- ALIGN_VAR_16(int32_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int32_t, opt_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -1782,7 +1782,7 @@
if ((i < BLOCK_64x64) && opt.cvt16to32_cnt[i])
{
HEADER("cvt16to32_cnt[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.cvt16to32_cnt[i], ref.cvt16to32_cnt[i], ibuf1, sbuf2, STRIDE);
+ REPORT_SPEEDUP(opt.cvt16to32_cnt[i], ref.cvt16to32_cnt[i], sbuf1, sbuf2, STRIDE);
}
}
More information about the x265-devel
mailing list