<div dir="ltr"><span style="font-family:arial,sans-serif;font-size:13px">-->  This still doesn't look optimized at all, it's just unrolled.</span><div><font face="arial, sans-serif"><br></font></div><div><font face="arial, sans-serif">We unrolled the loop because the HM code uses __int64, so data loading and unloading take much more time than the reducing calculations; we therefore unrolled the loop and vectorized the few calculations that were suitable. <br>
</font><div><span style="font-family:arial,sans-serif;font-size:13px"><br></span></div></div><div><font face="arial, sans-serif">--> </font><span style="font-family:arial,sans-serif;font-size:13px">use uint64_t here.  __int64 is a Microsoft data type.  Is 64bits really necessary?</span></div>
<div><span style="font-family:arial,sans-serif;font-size:13px"><br></span></div><div style><span style="font-family:arial,sans-serif;font-size:13px">Initially I replaced __int64 with int, and it seems __int64 is not required, but I need to run a few more tests; then we can vectorize the whole code.</span></div>
<div><font face="arial, sans-serif"><br></font></div></div><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Jun 28, 2013 at 12:36 AM, Steve Borho <span dir="ltr">&lt;<a href="mailto:steve@borho.org" target="_blank">steve@borho.org</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote"><div class="im">On Thu, Jun 27, 2013 at 7:27 AM,  <span dir="ltr">&lt;<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>&gt;</span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"># HG changeset patch<br>
# User praveentiwari<br>
# Date 1372336062 -19800<br>
# Node ID 2e227fd23fe25e9fe6dfcca2f1dac21474f4a7a0<br>
# Parent  321b2fd70a1bd58b2bb1c2351f49766709a15770<br>
Vector code for xCalQuantCoefEAdp<br></blockquote><div><br></div></div><div>This looks ok, but it doesn't match the fixes I made to the C primitive.  Details below.</div><div class="im"><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">


diff -r 321b2fd70a1b -r 2e227fd23fe2 source/common/vec/dct.inc<br>
--- a/source/common/vec/dct.inc Wed Jun 26 17:42:39 2013 +0530<br>
+++ b/source/common/vec/dct.inc Thu Jun 27 17:57:42 2013 +0530<br>
@@ -39,7 +39,6 @@<br>
 extern void fastForwardDst(Short *block, Short *coeff, Int shift);<br>
<br>
 namespace {<br>
-<br>
 /* Used for filter */<br>
 #define IF_INTERNAL_PREC 14 ///< Number of bits for internal precision<br>
 #define IF_FILTER_PREC    6 ///< Log2 of sum of filter taps<br>
@@ -3938,6 +3937,131 @@<br>
 #undef STROE_LINE<br>
     }<br>
 }<br><br></blockquote><div><br></div></div><div>use uint32_t for unsigned int.  and xCalQuantCoefEAdp should be quantaq</div><div class="im"><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">


+unsigned int xCalQuantCoefEAdp(int * coef,<br>
+                               int * quantCoeff,<br>
+                               int * deltaU,<br>
+                               int * qCoef,<br>
+                               int * arlCCoef,<br>
+                               int   qBitsC,<br>
+                               int   qBits,<br>
+                               int   add,<br>
+                               int   numCoeff)<br>
+{<br>
+    int addc   = 1 << (qBitsC - 1);<br>
+    int qBits8 = qBits - 8;<br>
+    unsigned int acSum = 0;<br>
+    int dstOffset = 0;<br>
+<br>
+    for (int blockpos = 0; blockpos < numCoeff; blockpos++)<br>
+    {<br>
+        int level1;<br>
+        int  sign1;<br>
+        level1  = coef[blockpos];<br>
+        sign1   = (level1 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel1 = (__int64)abs(level1) * quantCoeff[blockpos];<br></blockquote><div><br></div></div><div>use uint64_t here.  __int64 is a Microsoft data type.  Is 64bits really necessary?</div><div class="im">
<div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">

+        arlCCoef[blockpos] = (int)((tmplevel1 + addc) >> qBitsC);<br>
+        level1 = (int)((tmplevel1 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel1 - (level1 << qBits)) >> qBits8);<br>
+        blockpos++;<br></blockquote><div><br></div></div><div>This still doesn't look optimized at all, it's just unrolled.</div><div><div class="h5"><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">


+        int level2;<br>
+        int  sign2;<br>
+        level2  = coef[blockpos];<br>
+        sign2   = (level2 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel2 = (__int64)abs(level2) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel2 + addc) >> qBitsC);<br>
+        level2 = (int)((tmplevel2 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel2 - (level2 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        int level3;<br>
+        int  sign3;<br>
+        level3  = coef[blockpos];<br>
+        sign3   = (level3 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel3 = (__int64)abs(level3) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel3 + addc) >> qBitsC);<br>
+        level3 = (int)((tmplevel3 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel3 - (level3 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        int level4;<br>
+        int  sign4;<br>
+        level4  = coef[blockpos];<br>
+        sign4   = (level4 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel4 = (__int64)abs(level4) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel4 + addc) >> qBitsC);<br>
+        level4 = (int)((tmplevel4 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel4 - (level4 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        Vec4i qLevel1(level1, level2, level3, level4);<br>
+        Vec4i qSign1(sign1, sign2, sign3, sign4);<br>
+        acSum += horizontal_add(qLevel1);<br>
+        qLevel1 = qLevel1 * qSign1;<br>
+<br>
+        int level5;<br>
+        int  sign5;<br>
+        level5  = coef[blockpos];<br>
+        sign5   = (level5 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel5 = (__int64)abs(level5) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel5 + addc) >> qBitsC);<br>
+        level5 = (int)((tmplevel5 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel5 - (level5 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        int level6;<br>
+        int  sign6;<br>
+        level6  = coef[blockpos];<br>
+        sign6   = (level6 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel6 = (__int64)abs(level6) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel6 + addc) >> qBitsC);<br>
+        level6 = (int)((tmplevel6 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel6 - (level6 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        int level7;<br>
+        int  sign7;<br>
+        level7  = coef[blockpos];<br>
+        sign7   = (level7 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel7 = (__int64)abs(level7) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel7 + addc) >> qBitsC);<br>
+        level7 = (int)((tmplevel7 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel7 - (level7 << qBits)) >> qBits8);<br>
+        blockpos++;<br>
+<br>
+        int level8;<br>
+        int  sign8;<br>
+        level8  = coef[blockpos];<br>
+        sign8   = (level8 < 0 ? -1 : 1);<br>
+<br>
+        __int64 tmplevel8 = (__int64)abs(level8) * quantCoeff[blockpos];<br>
+        arlCCoef[blockpos] = (int)((tmplevel8 + addc) >> qBitsC);<br>
+        level8 = (int)((tmplevel8 + add) >> qBits);<br>
+        deltaU[blockpos] = (int)((tmplevel8 - (level8 << qBits)) >> qBits8);<br>
+<br>
+        Vec4i qLevel2(level5, level6, level7, level8);<br>
+        Vec4i qSign2(sign5, sign6, sign7, sign8);<br>
+        acSum += horizontal_add(qLevel2);<br>
+        qLevel2 = qLevel2 * qSign2;<br>
+        Vec8s quantCoef = compress_saturated(qLevel1, qLevel2);<br>
+        Vec4i quantCoef1 = extend_low(quantCoef);<br>
+        Vec4i quantCoef2 = extend_high(quantCoef);<br>
+        quantCoef1.store(qCoef + dstOffset);<br>
+        dstOffset += 4;<br>
+        quantCoef2.store(qCoef + dstOffset);<br>
+        dstOffset += 4;<br>
+    }<br>
+<br>
+    return acSum;<br>
+}<br>
 }<br>
<br>
 #include "utils.h"<br>
@@ -3948,6 +4072,7 @@<br>
 void NAME(Setup_Vec_DCTPrimitives)(EncoderPrimitives &p)<br>
 {<br>
     p.deQuant = xDeQuant;<br>
+    p.calQuantCoefEAdp = xCalQuantCoefEAdp;<br></blockquote><div><br></div></div></div><div>this primitive is now just called quantaq</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
<div class="im">

<br>
     // TODO: in 16bpp mode, the intermediate must be 32-bits<br>
 #if !HIGH_BIT_DEPTH && INSTRSET > 4<br></div>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="http://mailman.videolan.org/listinfo/x265-devel" target="_blank">http://mailman.videolan.org/listinfo/x265-devel</a><span class="HOEnZb"><font color="#888888"><br>
</font></span></blockquote></div><span class="HOEnZb"><font color="#888888"><br><br clear="all"><div><br></div>-- <br>Steve Borho
</font></span></div></div>
<br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="http://mailman.videolan.org/listinfo/x265-devel" target="_blank">http://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>