<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Wed, Jul 17, 2013 at 2:36 AM,  <span dir="ltr"><<a href="mailto:gopu@multicorewareinc.com" target="_blank">gopu@multicorewareinc.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>

# User ggopu<br>

# Date 1374046566 -19800<br>

# Node ID c6c045e9272798ce5268643436759db22fd1950b<br>

# Parent  054d8c409569100c4aacb015ffb1b3281100d993<br>

TShortYUV : Implemented perfomance Primitives pixeladd (Clipadd)<br>

<br>

diff -r 054d8c409569 -r c6c045e92727 source/common/TShortYUV.cpp<br>

--- a/source/common/TShortYUV.cpp       Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/common/TShortYUV.cpp       Wed Jul 17 13:06:06 2013 +0530<br>

@@ -122,61 +122,38 @@<br>

 #pragma warning (disable: 4244)<br>

 #endif<br>

<br>

-void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>

+void TShortYUV::addClipLuma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br></blockquote><div><br></div><div>pcYuvSrc0 -> srcYuv0, pcYuvSrc1 -> srcYuv1</div><div><br></div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

 {<br>

-    int x, y;<br>

+    int x = partSize, y = partSize;<br></blockquote><div><br></div><div>only need one var here (in the other places too)</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<br>

-    short* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pDst  = getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

+    short* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);<br>

+    short* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);<br>

+    short* dst  = getLumaAddr(trUnitIdx, partSize);<br></blockquote><div><br></div><div>TShortYUV will probably need to be changed to UShort soon (globally) because I am going to change 16bpp Pel to UShort to remove the impedance mismatch with x264's pixel type.</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

-    unsigned int iSrc0Stride = pcYuvSrc0->width;<br>

-    unsigned int iSrc1Stride = pcYuvSrc1->width;<br>

-    unsigned int iDstStride  = width;<br>

+    unsigned int src0Stride = pcYuvSrc0->width;<br>

+    unsigned int src1Stride = pcYuvSrc1->width;<br>

+    unsigned int dstStride  = width;<br></blockquote><div><br></div><div>To be pedantic, these should be intptr_t</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<br>

-    for (y = uiPartSize - 1; y >= 0; y--)<br>

-    {<br>

-        for (x = uiPartSize - 1; x >= 0; x--)<br>

-        {<br>

-            pDst[x] = ClipY(pSrc0[x] + pSrc1[x]);<br>

-        }<br>

-<br>

-        pSrc0 += iSrc0Stride;<br>

-        pSrc1 += iSrc1Stride;<br>

-        pDst  += iDstStride;<br>

-    }<br>

+    primitives.pixeladd(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);<br>

 }<br>

<br>

-void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>

+void TShortYUV::addClipChroma(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br>

 {<br>

-    int x, y;<br>

+    int x = partSize, y = partSize;<br>

<br>

-    short* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);<br>

+    short* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);<br>

+    short* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);<br>

+    short* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);<br>

+    short* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);<br>

+    short* dstU = getCbAddr(trUnitIdx, partSize);<br>

+    short* dstV = getCrAddr(trUnitIdx, partSize);<br>

<br>

-    unsigned int  iSrc0Stride = pcYuvSrc0->Cwidth;<br>

-    unsigned int  iSrc1Stride = pcYuvSrc1->Cwidth;<br>

-    unsigned int  iDstStride  = Cwidth;<br>

+    unsigned int  src0Stride = pcYuvSrc0->Cwidth;<br>

+    unsigned int  src1Stride = pcYuvSrc1->Cwidth;<br>

+    unsigned int  dstStride  = Cwidth;<br>

<br>

-    for (y = uiPartSize - 1; y >= 0; y--)<br>

-    {<br>

-        for (x = uiPartSize - 1; x >= 0; x--)<br>

-        {<br>

-            pDstU[x] = ClipC(pSrcU0[x] + pSrcU1[x]);<br>

-            pDstV[x] = ClipC(pSrcV0[x] + pSrcV1[x]);<br>

-        }<br>

-<br>

-        pSrcU0 += iSrc0Stride;<br>

-        pSrcU1 += iSrc1Stride;<br>

-        pSrcV0 += iSrc0Stride;<br>

-        pSrcV1 += iSrc1Stride;<br>

-        pDstU  += iDstStride;<br>

-        pDstV  += iDstStride;<br>

-    }<br>

+    primitives.pixeladd(x, y, dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);<br>

+    primitives.pixeladd(x, y, dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);<br>

 }<br>

<br>

 #if _MSC_VER<br>

diff -r 054d8c409569 -r c6c045e92727 source/common/pixel.cpp<br>

--- a/source/common/pixel.cpp   Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/common/pixel.cpp   Wed Jul 17 13:06:06 2013 +0530<br>

@@ -30,7 +30,6 @@<br>

 #include <algorithm><br>

 #include <stdlib.h> // abs()<br></blockquote><div><br></div><div>unrelated white-space changes, should be in a separate patch</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


-<br>

 #define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \<br>

     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4,  DATA_TYPE1, DATA_TYPE2>;  \<br>

     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8,  DATA_TYPE1, DATA_TYPE2>;  \<br>

@@ -504,14 +503,15 @@<br>

 void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth)<br>

 {<br>

     int x, y;<br></blockquote><div><br></div><div>ditto</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+<br>

     for (y = height - 1; y >= 0; y--)<br>

     {<br>

         for (x = width - 1; x >= 0; )<br>

         {<br>

             // note: luma min width is 4<br>

-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

             x--;<br>

-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

             x--;<br>

         }<br>

<br>

@@ -535,6 +535,20 @@<br>

     }<br>

 }<br></blockquote><div><br></div><div>non-templated C primitives should end with _c</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+void pixeladd_ss(int bx, int by, short *a, intptr_t dstride, short *b0, short *b1, intptr_t sstride0, intptr_t sstride1)<br>

+{<br>

+    for (int y = 0; y < by; y++)<br>

+    {<br>

+        for (int x = 0; x < bx; x++)<br>

+        {<br>

+            a[x] = (short)ClipY(b0[x] + b1[x]);<br>

+        }<br>

+<br>

+        b0 += sstride0;<br>

+        b1 += sstride1;<br>

+        a += dstride;<br>

+    }<br>

+}<br>

 }  // end anonymous namespace<br>

<br>

 namespace x265 {<br>

@@ -738,5 +752,6 @@<br>

     p.weightpUni = weightUnidir;<br>

<br>

     p.pixelsubsp = pixelsub_sp;<br>

+    p.pixeladd   = pixeladd_ss;<br></blockquote><div><br></div><div>if the function requires shorts, presumably there is a TComPicYuv which does the same operation using uint8_t, so we will want a version that supports 8bit pixels. The primitive funcdef should therefore be named p.pixeladd_pp, and this particular version should only be used for high-bit-depth compiles.</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

 }<br>

 }<br>

diff -r 054d8c409569 -r c6c045e92727 source/common/primitives.h<br>

--- a/source/common/primitives.h        Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/common/primitives.h        Wed Jul 17 13:06:06 2013 +0530<br>

@@ -215,7 +215,8 @@<br>

 typedef void (*dequant_t)(int bitDepth, const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int trSizeLog2, int *dequantCoef);<br>

 typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);<br>

 typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth);<br>

-typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);<br>

+typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);<br>

+typedef void (*pixeladd_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);<br>

<br>

 /* Define a structure containing function pointers to optimized encoder<br>

  * primitives.  Each pointer can reference either an assembly routine,<br>

@@ -267,6 +268,7 @@<br>

<br>

     weightpUni_t    weightpUni;<br>

     pixelsub_sp_t   pixelsubsp;<br>

+    pixeladd_t      pixeladd;<br>

 };<br>

<br>

 /* This copy of the table is what gets used by the encoder.<br>

diff -r 054d8c409569 -r c6c045e92727 source/common/vec/blockcopy.inc<br>

--- a/source/common/vec/blockcopy.inc   Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/common/vec/blockcopy.inc   Wed Jul 17 13:06:06 2013 +0530<br>

@@ -79,7 +79,7 @@<br>

         }<br>

     }<br>

     else<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

-#endif<br>

+#endif /* if INSTRSET >= 8 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -131,7 +131,7 @@<br>

         }<br>

     }<br>

     else<br>

-#endif<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -170,6 +170,7 @@<br>

 void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)<br>

 {<br>

     size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;<br>

+<br>

 #if INSTRSET >= 8 && 0<br>

     if (!(aligncheck & 31))<br>

     {<br>

@@ -189,7 +190,7 @@<br>

         }<br>

     }<br>

     else<br>

-#endif<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -292,6 +293,84 @@<br>

     }<br>

 }<br>

<br>

+void pixeladd_ss(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1)<br>

+{<br>

+    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;<br></blockquote><div><br></div><div>alignment restrictions for shorts are more complicated.  the pointers and strides must be 32 byte aligned for AVX2 but the width only needs to be a multiple of 16</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+<br>

+#if INSTRSET >= 8 && 0<br>

+    if (!(aligncheck & 31))<br>

+    {<br>

+        // fast path, multiples of 32 pixel wide blocks<br>

+        // fast path, multiples of 16 pixel wide blocks<br></blockquote><div><br></div><div>?</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 8)<br>

+            {<br>

+                Vec8s vecsrc0, vecsrc1, vecsum;<br>

+                Vec8s zero(0), maxval((1 << 8) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8<br></blockquote><div><br></div><div>let's not add another bomb that awaits us when we want to use larger bit-depths.  either use g_bitDepthY directly here or pass in the bit-depth</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+                vecsrc0.load_a(src0 + x);<br>

+                vecsrc1.load_a(src1 + x);<br>

+<br>

+                vecsum = vecsrc0 + vecsrc1;<br>

+                vecsum = max(vecsum, zero);<br>

+                vecsum = min(vecsum, maxval);<br>

+<br>

+                vecsum.store(dst + x);<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+    else<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

+    if (!(aligncheck & 15))<br>

+    {<br>

+        // fast path, multiples of 16 pixel wide blocks<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 8)<br>

+            {<br>

+                Vec8s vecsrc0, vecsrc1, vecsum;<br>

+                Vec8s zero(0), maxval((1 << 8) - 1); // Currently g_bitDepthY = 8 and g_bitDepthC = 8<br>

+                vecsrc0.load_a(src0 + x);<br>

+                vecsrc1.load_a(src1 + x);<br>

+<br>

+                vecsum = add_saturated(vecsrc0, vecsrc1);<br>

+                vecsum = max(vecsum, zero);<br>

+                vecsum = min(vecsum, maxval);<br>

+<br>

+                vecsum.store(dst + x);<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+    else<br>

+    {<br>

+        int tmp;<br>

+        int max = (1 << 8) - 1;<br>

+        // slow path, irregular memory alignments or sizes<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x++)<br>

+            {<br>

+                tmp = src0[x] + src1[x];<br>

+                tmp = tmp < 0 ? 0 : tmp;<br>

+                tmp = tmp > max ? max : tmp;<br>

+                dst[x] = (short)tmp;<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+}<br>

+<br>

 void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)<br>

 {<br>

 #if HIGH_BIT_DEPTH<br>

@@ -307,5 +386,6 @@<br>

     p.blockcpy_sp = blockcopy_s_p;<br>

     p.blockcpy_sc = blockcopy_s_p;<br>

     p.pixelsubsp = pixelsub_sp;<br>

-#endif<br>

+    p.pixeladd = pixeladd_ss;<br>

+#endif /* if HIGH_BIT_DEPTH */<br>

 }<br>

diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.cpp<br>

--- a/source/test/pixelharness.cpp      Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/test/pixelharness.cpp      Wed Jul 17 13:06:06 2013 +0530<br>

@@ -376,6 +376,29 @@<br>

     return true;<br>

 }<br>

<br>

+bool PixelHarness::check_pixeladd_sp(x265::pixeladd_t ref, x265::pixeladd_t opt)<br>

+{<br>

+    ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>

+    ALIGN_VAR_16(short, opt_dest[64 * 64]);<br>

+    int bx = 64;<br>

+    int by = 64;<br>

+    int j = 0;<br>

+    for (int i = 0; i <= 100; i++)<br>

+    {<br>

+        opt(bx, by, opt_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);<br>

+        ref(bx, by, ref_dest, 64, (short*)pbuf2 + j, (short*)pbuf1 + j, 128, 128);<br>

+<br>

+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))<br>

+            return false;<br>

+<br>

+        j += 4;<br>

+        bx = 4 * ((rand() & 15) + 1);<br>

+        by = 4 * ((rand() & 15) + 1);<br>

+    }<br>

+<br>

+    return true;<br>

+}<br>

+<br>

 bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)<br>

 {<br>

     for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)<br>

@@ -535,6 +558,14 @@<br>

         }<br>

     }<br>

<br>

+    if (opt.pixeladd)<br>

+    {<br>

+        if (!check_pixeladd_sp(ref.pixeladd, opt.pixeladd))<br>

+        {<br>

+            printf("Pixel Add failed!\n");<br></blockquote><div><br></div><div>upper case here not necessary</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


+            return false;<br>

+        }<br>

+    }<br>

     return true;<br>

 }<br>

<br>

@@ -649,4 +680,10 @@<br>

         printf("Pixel Sub");<br></blockquote><div><br></div><div>upper case here not necessary</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


         REPORT_SPEEDUP(opt.pixelsubsp, ref.pixelsubsp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);<br>

     }<br>

+<br>

+    if (opt.pixeladd)<br>

+    {<br>

+        printf("Pixel Add");<br>

+        REPORT_SPEEDUP(opt.pixeladd, ref.pixeladd, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, (short*)pbuf1, STRIDE, STRIDE);<br>

+    }<br>

 }<br>

diff -r 054d8c409569 -r c6c045e92727 source/test/pixelharness.h<br>

--- a/source/test/pixelharness.h        Wed Jul 17 11:33:20 2013 +0530<br>

+++ b/source/test/pixelharness.h        Wed Jul 17 13:06:06 2013 +0530<br>

@@ -48,6 +48,7 @@<br>

     bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);<br>

     bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);<br>

     bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);<br>

+    bool check_pixeladd_sp(x265::pixeladd_t ref, x265::pixeladd_t opt);<br>

<br>

 public:<br>

<br>

_______________________________________________<br>

x265-devel mailing list<br>

<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>

<a href="http://mailman.videolan.org/listinfo/x265-devel" target="_blank">http://mailman.videolan.org/listinfo/x265-devel</a><br>

</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho

</div></div>