<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Oct 17, 2013 at 10:32 AM,  <span dir="ltr"><<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>

# User Dnyaneshwar Gorade <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>

# Date 1382023822 -19800<br>

#      Thu Oct 17 21:00:22 2013 +0530<br>

# Node ID 4dbd17ef69db91b5604f9c5cc6a4a62f15b91ab0<br>

# Parent  f6d04c660b9bb1b0cf6274faf514be77148aa312<br>

added cvt32to16_shr function to testbench.<br>

<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/Lib/TLibCommon/TComTrQuant.cpp<br>

--- a/source/Lib/TLibCommon/TComTrQuant.cpp     Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/Lib/TLibCommon/TComTrQuant.cpp     Thu Oct 17 21:00:22 2013 +0530<br>

@@ -493,7 +493,7 @@<br>

         transformSkipShift = shift;<br>

         for (j = 0; j < height; j++)<br>

         {<br>

-            primitives.cvt32to16_shr(&residual[j * stride], &coef[j * width], shift, width);<br>

+            primitives.cvt32to16_shr(residual, coef, stride, shift, width);<br></blockquote><div><br></div><div>You should explain in the commit message why a stride argument is now required.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


         }<br>

     }<br>

     else<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/common/pixel.cpp<br>

--- a/source/common/pixel.cpp   Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/common/pixel.cpp   Thu Oct 17 21:00:22 2013 +0530<br>

@@ -439,13 +439,18 @@<br>

     }<br>

 }<br>

<br>

-void convert32to16_shr(short *dst, int *src, int shift, int num)<br>

+void convert32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)<br>

 {<br>

     int round = 1 << (shift - 1);<br>

<br>

-    for (int i = 0; i < num; i++)<br>

+    for (int i = 0; i < size; i++)<br>

     {<br>

-        dst[i] = (short)((src[i] + round) >> shift);<br>

+        for (int j = 0; j < size; j++)<br>

+        {<br>

+            dst[j] = (short)((src[j] + round) >> shift);<br>

+        }<br>

+        src += size;<br>

+        dst += stride;<br>

     }<br>

 }<br>

<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/common/primitives.h<br>

--- a/source/common/primitives.h        Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/common/primitives.h        Thu Oct 17 21:00:22 2013 +0530<br>

@@ -179,7 +179,7 @@<br>

<br>

 typedef void (*cvt16to32_shl_t)(int *dst, short *src, intptr_t, int, int);<br>

 typedef void (*cvt16to16_shl_t)(short *dst, short *src, int, int, intptr_t, int);<br>

-typedef void (*cvt32to16_shr_t)(short *dst, int *src, int, int);<br>

+typedef void (*cvt32to16_shr_t)(short *dst, int *src, intptr_t, int, int);<br>

<br>

 typedef void (*dct_t)(short *src, int *dst, intptr_t stride);<br>

 typedef void (*idct_t)(int *src, short *dst, intptr_t stride);<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/common/vec/pixel-sse3.cpp<br>

--- a/source/common/vec/pixel-sse3.cpp  Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/common/vec/pixel-sse3.cpp  Thu Oct 17 21:00:22 2013 +0530<br>

@@ -31,23 +31,25 @@<br>

 using namespace x265;<br>

<br>

 namespace {<br>

-void convert32to16_shr(short *dst, int *org, int shift, int num)<br>

+void convert32to16_shr(short *dst, int *org, intptr_t stride, int shift, int size)<br>

 {<br>

-    int i;<br>

+    int i, j;<br>

     __m128i round = _mm_set1_epi32(1 << (shift - 1));<br>

<br>

-    for (i = 0; i < num; i += 4)<br>

+    for (i = 0; i < size; i++)<br>

     {<br>

-        __m128i im32;<br>

-        __m128i im16;<br>

-<br>

-        im32 = _mm_loadu_si128((__m128i const*)org);<br>

-        im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));<br>

-        im16 = _mm_packs_epi32(im32, im32);<br>

-        _mm_storeu_si128((__m128i*)dst, im16);<br>

-<br>

-        org += 4;<br>

-        dst += 4;<br>

+        for (j = 0; j < size; j += 4)<br>

+        {<br>

+            __m128i im32;<br>

+            __m128i im16;<br>

+<br>

+            im32 = _mm_loadu_si128((__m128i const*)(org + j));<br>

+            im32 = _mm_sra_epi32(_mm_add_epi32(im32, round), _mm_cvtsi32_si128(shift));<br>

+            im16 = _mm_packs_epi32(im32, im32);<br>

+            _mm_storel_epi64((__m128i*)(dst + j), im16);<br>

+        }<br>

+        org += size;<br>

+        dst += stride;<br>

     }<br>

 }<br>

<br>

@@ -639,6 +641,7 @@<br>

     //p.cvt32to16_shr = convert32to16_shr;<br>

     p.cvt16to32_shl = convert16to32_shl;<br>

     p.cvt16to16_shl = convert16to16_shl;<br>

+    p.cvt32to16_shr = convert32to16_shr;<br>

<br>

 #if !HIGH_BIT_DEPTH<br>

     p.transpose[0] = transpose4;<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.cpp<br>

--- a/source/test/pixelharness.cpp      Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/test/pixelharness.cpp      Thu Oct 17 21:00:22 2013 +0530<br>

@@ -494,6 +494,39 @@<br>

     return true;<br>

 }<br>

<br>

+bool PixelHarness::check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt)<br>

+{<br>

+    int bufsize = STRIDE * STRIDE;<br>

+    int* src = (int*)X265_MALLOC(int, bufsize);<br></blockquote><div><br></div><div>Does this really need a new buffer? Can you just use the test buffers already allocated? If not, then all of this needs to be in the pixel harness constructor.</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+<br>

+    int  shift =  (rand() % 7 + 1);<br>

+<br>

+    if (!src)<br>

+    {<br>

+        fprintf(stderr, "malloc failed, unable to initiate tests!\n");<br>

+        exit(1);<br>

+    }<br>

+<br>

+    for (int i = 0; i < bufsize; i++)<br>

+    {<br>

+        src[i] = (rand() & (2 * SHORT_MAX + 1)) - SHORT_MAX - 1;<br>

+    }<br>

+<br>

+    ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>

+    ALIGN_VAR_16(short, opt_dest[64 * 64]);<br>

+<br>

+    memset(ref_dest, 0, 64 * 64 * sizeof(short));<br>

+    memset(opt_dest, 0, 64 * 64 * sizeof(short));<br>

+<br>

+    opt(opt_dest, src, STRIDE, shift, STRIDE);<br>

+    ref(ref_dest, src, STRIDE, shift, STRIDE);<br>

+<br>

+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))<br>

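+            return false;<br></blockquote><div><br></div><div>White-space: the memcmp check and the return statements here are indented one level too deep; use the same 4-space indent as the rest of the function.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">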

+<br>

+        return true;<br>

+}<br>

+<br>

 bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)<br>

 {<br>

     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);<br>

@@ -665,7 +698,14 @@<br>

             }<br>

         }<br>

     }<br>

-<br>

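+    if(opt.cvt32to16_shr)<br></blockquote><div><br></div><div>White-space: add a space after "if", and keep the blank line that separated this check from the previous block.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">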

+    {<br>

+        if (!check_cvt32to16_shr_t(ref.cvt32to16_shr, opt.cvt32to16_shr))<br>

+        {<br>

+            printf("cvt32to16 failed!\n");<br>

+            return false;<br>

+        }<br>

+    }<br></blockquote><div><br></div><div>You also need to add this primitive to the function that compares performance between the optimized and C reference implementations.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


     if (opt.blockcpy_pp)<br>

     {<br>

         if (!check_block_copy(ref.blockcpy_pp, opt.blockcpy_pp))<br>

diff -r f6d04c660b9b -r 4dbd17ef69db source/test/pixelharness.h<br>

--- a/source/test/pixelharness.h        Thu Oct 17 20:34:48 2013 +0530<br>

+++ b/source/test/pixelharness.h        Thu Oct 17 21:00:22 2013 +0530<br>

@@ -53,6 +53,7 @@<br>

     bool check_pixeladd_pp(pixeladd_pp_t ref, pixeladd_pp_t opt);<br>

     bool check_downscale_t(downscale_t ref, downscale_t opt);<br>

     bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);<br>

+    bool check_cvt32to16_shr_t(cvt32to16_shr_t ref, cvt32to16_shr_t opt);<br>

<br>

 public:<br>

<br>

_______________________________________________<br>

x265-devel mailing list<br>

<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>

<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>

</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho

</div></div>