<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 17, 2013 at 10:32 AM, <span dir="ltr"><<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar Gorade <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>
# Date 1382023822 -19800<br>
# Thu Oct 17 21:00:22 2013 +0530<br>
# Node ID 4dbd17ef69db91b5604f9c5cc6a4a62f15b91ab0<br>
# Parent f6d04c660b9bb1b0cf6274faf514be77148aa312<br>
added cvt32to16_shr function to testbench.<br>
<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/Lib/TLibCommon/TComTrQuant.cpp<br>
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Thu Oct 17 21:00:22 2013 +0530<br>
@@ -493,7 +493,7 @@<br>
transformSkipShift = shift;<br>
for (j = 0; j < height; j++)<br>
{<br>
- primitives.cvt32to16_shr(&residual[j * stride], &coef[j * width], shift, width);<br>
+ primitives.cvt32to16_shr(residual, coef, stride, shift, width);<br></blockquote><div><br></div><div>You should explain in the commit message why a stride argument is now required.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
}<br>
}<br>
else<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/common/pixel.cpp<br>
--- a/source/common/pixel.cpp Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/common/pixel.cpp Thu Oct 17 21:00:22 2013 +0530<br>
@@ -439,13 +439,18 @@<br>
}<br>
}<br>
<br>
-void convert32to16_shr(short *dst, int *src, int shift, int num)<br>
+void convert32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)<br>
{<br>
int round = 1 << (shift - 1);<br>
<br>
- for (int i = 0; i < num; i++)<br>
+ for (int i = 0; i < size; i++)<br>
{<br>
- dst[i] = (short)((src[i] + round) >> shift);<br>
+ for (int j = 0; j < size; j++)<br>
+ {<br>
+ dst[j] = (short)((src[j] + round) >> shift);<br>
+ }<br>
+ src += size;<br>
+ dst += stride;<br>
}<br>
}<br>
<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/common/primitives.h<br>
--- a/source/common/primitives.h Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/common/primitives.h Thu Oct 17 21:00:22 2013 +0530<br>
@@ -179,7 +179,7 @@<br>
<br>
typedef void (*cvt16to32_shl_t)(int *dst, short *src, intptr_t, int, int);<br>
typedef void (*cvt16to16_shl_t)(short *dst, short *src, int, int, intptr_t, int);<br>
-typedef void (*cvt32to16_shr_t)(short *dst, int *src, int, int);<br>
+typedef void (*cvt32to16_shr_t)(short *dst, int *src, intptr_t, int, int);<br>
<br>
typedef void (*dct_t)(short *src, int *dst, intptr_t stride);<br>
typedef void (*idct_t)(int *src, short *dst, intptr_t stride);<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/common/vec/pixel-sse3.cpp<br>
--- a/source/common/vec/pixel-sse3.cpp Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/common/vec/pixel-sse3.cpp Thu Oct 17 21:00:22 2013 +0530<br>
@@ -31,23 +31,25 @@<br>
using namespace x265;<br>
<br>
namespace {<br>
-void convert32to16_shr(short *dst, int *org, int shift, int num)<br>
+void convert32to16_shr(short *dst, int *org, intptr_t stride, int shift, int size)<br>
{<br>
- int i;<br>
+ int i, j;<br>
__m128i round = _mm_set1_epi32(1 << (shift - 1));<br>
<br>
- for (i = 0; i < num; i += 4)<br>
+ for (i = 0; i < size; i++)<br>
{<br>
- __m128i im32;<br>
- __m128i im16;<br>
-<br>
- im32 = _mm_loadu_si128((__m128i const*)org);<br>
- im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));<br>
- im16 = _mm_packs_epi32(im32, im32);<br>
- _mm_storeu_si128((__m128i*)dst, im16);<br>
-<br>
- org += 4;<br>
- dst += 4;<br>
+ for (j = 0; j < size; j += 4)<br>
+ {<br>
+ __m128i im32;<br>
+ __m128i im16;<br>
+<br>
+ im32 = _mm_loadu_si128((__m128i const*)(org + j));<br>
+ im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));<br>
+ im16 = _mm_packs_epi32(im32, im32);<br>
+ _mm_storel_epi64((__m128i*)(dst + j), im16);<br>
+ }<br>
+ org += size;<br>
+ dst += stride;<br>
}<br>
}<br>
<br>
@@ -639,6 +641,7 @@<br>
//p.cvt32to16_shr = convert32to16_shr;<br>
p.cvt16to32_shl = convert16to32_shl;<br>
p.cvt16to16_shl = convert16to16_shl;<br>
+ p.cvt32to16_shr = convert32to16_shr;<br>
<br>
#if !HIGH_BIT_DEPTH<br>
p.transpose[0] = transpose4;<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.cpp<br>
--- a/source/test/pixelharness.cpp Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/test/pixelharness.cpp Thu Oct 17 21:00:22 2013 +0530<br>
@@ -494,6 +494,39 @@<br>
return true;<br>
}<br>
<br>
+bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)<br>
+{<br>
+ int bufsize = STRIDE * STRIDE;<br>
+ int* src = (int*)X265_MALLOC(int, bufsize);<br></blockquote><div><br></div><div>Does this really need a new buffer? Can you just use the test buffers that are already allocated? If not, then this allocation and its error handling need to be in the PixelHarness constructor.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+ int shift = (rand() % 7 + 1);<br>
+<br>
+ if (!src)<br>
+ {<br>
+ fprintf(stderr, "malloc failed, unable to initiate tests!\n");<br>
+ exit(1);<br>
+ }<br>
+<br>
+ for (int i = 0; i < bufsize; i++)<br>
+ {<br>
+ src[i] = (rand() & (2 * SHORT_MAX + 1)) - SHORT_MAX - 1;<br>
+ }<br>
+<br>
+ ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>
+ ALIGN_VAR_16(short, opt_dest[64 * 64]);<br>
+<br>
+ memset(ref_dest, 0, 64 * 64 * sizeof(short));<br>
+ memset(opt_dest, 0, 64 * 64 * sizeof(short));<br>
+<br>
+ opt(opt_dest, src, STRIDE, shift, STRIDE);<br>
+ ref(ref_dest, src, STRIDE, shift, STRIDE);<br>
+<br>
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))<br>
+ return false;<br></blockquote><div><br></div><div>white-space</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+ return true;<br>
+}<br>
+<br>
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)<br>
{<br>
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);<br>
@@ -665,7 +698,14 @@<br>
}<br>
}<br>
}<br>
-<br>
+ if(opt.cvt32to16_shr)<br></blockquote><div><br></div><div>white-space: put a space after <code>if</code>, and keep the blank line before this block</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ {<br>
+ if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))<br>
+ {<br>
+ printf("cvt32to16 failed!\n");<br>
+ return false;<br>
+ }<br>
+ }<br></blockquote><div><br></div><div>You also need to add this primitive to the function that compares performance between the optimized and C reference implementations.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
if (opt.blockcpy_pp)<br>
{<br>
if (!check_block_copy(ref.blockcpy_pp, opt.blockcpy_pp))<br>
diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.h<br>
--- a/source/test/pixelharness.h Thu Oct 17 20:34:48 2013 +0530<br>
+++ b/source/test/pixelharness.h Thu Oct 17 21:00:22 2013 +0530<br>
@@ -53,6 +53,7 @@<br>
bool check_pixeladd_pp(pixeladd_pp_t ref, pixeladd_pp_t opt);<br>
bool check_downscale_t(downscale_t ref, downscale_t opt);<br>
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);<br>
+ bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);<br>
<br>
public:<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>