<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Tue, Jul 16, 2013 at 7:10 AM,  <span dir="ltr"><<a href="mailto:gopu@multicorewareinc.com" target="_blank">gopu@multicorewareinc.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>

# User ggopu<br>

# Date 1373976605 -19800<br>

# Node ID be5257d512becc658a3c0f1d7a0a7defd0c911af<br>

# Parent  c9bb72e8cb8effc0d1d0e99f0b9abc8d341c652a<br>

TShortYuv : Performance Primitives for Luma and Chroma Subtracting<br>

<br>

diff -r c9bb72e8cb8e -r be5257d512be source/common/TShortYUV.cpp<br>

--- a/source/common/TShortYUV.cpp       Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/common/TShortYUV.cpp       Tue Jul 16 17:40:05 2013 +0530<br>

@@ -30,6 +30,8 @@<br>

 #include "TShortYUV.h"<br>

 #include "TLibCommon/TComYuv.h"<br>

<br>

+using namespace x265;<br>

+<br>

 TShortYUV::TShortYUV()<br>

 {<br>

     YBuf = NULL;<br>

@@ -76,61 +78,37 @@<br>

     subtractChroma(pcYuvSrc0, pcYuvSrc1,  uiTrUnitIdx, uiPartSize >> 1);<br>

 }<br>

<br>

-void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>

+void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br>

 {<br>

-    int x, y;<br>

+    int x = partSize, y = partSize;<br>

<br>

-    Pel* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

-    Pel* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

-    Short* pDst  = getLumaAddr(uiTrUnitIdx, uiPartSize);<br>

+    Pel* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);<br>

+    Pel* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);<br>

+    Short* dst  = getLumaAddr(trUnitIdx, partSize);<br>

<br>

-    int  iSrc0Stride = pcYuvSrc0->getStride();<br>

-    int  iSrc1Stride = pcYuvSrc1->getStride();<br>

-    int  iDstStride  = width;<br>

+    int  src0Stride = pcYuvSrc0->getStride();<br>

+    int  src1Stride = pcYuvSrc1->getStride();<br>

+    int  dstStride  = width;<br>

<br>

-    for (y = uiPartSize - 1; y >= 0; y--)<br>

-    {<br>

-        for (x = uiPartSize - 1; x >= 0; x--)<br>

-        {<br>

-            pDst[x] = static_cast<short>(pSrc0[x]) - static_cast<short>(pSrc1[x]);<br>

-        }<br>

-<br>

-        pSrc0 += iSrc0Stride;<br>

-        pSrc1 += iSrc1Stride;<br>

-        pDst  += iDstStride;<br>

-    }<br>

+    primitives.LumaSubstract_sp(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);<br>

 }<br>

<br>

-void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>

+void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br>

 {<br>

-    int x, y;<br>

+    int x = partSize, y = partSize;<br>

<br>

-    Pel* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    Pel* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    Pel* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);<br>

-    Pel* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);<br>

-    Short* pDstU  = getCbAddr(uiTrUnitIdx, uiPartSize);<br>

-    Short* pDstV  = getCrAddr(uiTrUnitIdx, uiPartSize);<br>

+    Pel* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);<br>

+    Pel* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);<br>

+    Pel* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);<br>

+    Pel* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);<br>

+    Short* dstU  = getCbAddr(trUnitIdx, partSize);<br>

+    Short* dstV  = getCrAddr(trUnitIdx, partSize);<br>

<br>

-    int  iSrc0Stride = pcYuvSrc0->getCStride();<br>

-    int  iSrc1Stride = pcYuvSrc1->getCStride();<br>

-    int  iDstStride  = Cwidth;<br>

+    int  src0Stride = pcYuvSrc0->getCStride();<br>

+    int  src1Stride = pcYuvSrc1->getCStride();<br>

+    int  dstStride  = Cwidth;<br>

<br>

-    for (y = uiPartSize - 1; y >= 0; y--)<br>

-    {<br>

-        for (x = uiPartSize - 1; x >= 0; x--)<br>

-        {<br>

-            pDstU[x] = static_cast<short>(pSrcU0[x]) - static_cast<short>(pSrcU1[x]);<br>

-            pDstV[x] = static_cast<short>(pSrcV0[x]) - static_cast<short>(pSrcV1[x]);<br>

-        }<br>

-<br>

-        pSrcU0 += iSrc0Stride;<br>

-        pSrcU1 += iSrc1Stride;<br>

-        pSrcV0 += iSrc0Stride;<br>

-        pSrcV1 += iSrc1Stride;<br>

-        pDstU  += iDstStride;<br>

-        pDstV  += iDstStride;<br>

-    }<br>

+    primitives.ChromaSubstract_sp(x, y, dstU, dstStride, dstV, dstStride, srcU0, srcU1, src0Stride, src1Stride, srcV0, srcV1, src0Stride, src1Stride);<br>

 }<br></blockquote><div> </div><div>This part looks fine.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

 void TShortYUV::addClip(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>

diff -r c9bb72e8cb8e -r be5257d512be source/common/pixel.cpp<br>

--- a/source/common/pixel.cpp   Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/common/pixel.cpp   Tue Jul 16 17:40:05 2013 +0530<br>

@@ -30,7 +30,6 @@<br>

 #include "TLibCommon/CommonDef.h"<br>

 #include "TLibCommon/TComPrediction.h"<br>

<br>

-<br>

 #define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \<br>

     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4,  DATA_TYPE1, DATA_TYPE2>;  \<br>

     p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8]   = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8,  DATA_TYPE1, DATA_TYPE2>;  \<br>

@@ -388,6 +387,41 @@<br>

     }<br>

 }<br></blockquote><div><br></div><div>Why use a capital L? This should just be lumasubtract_sp or, even better, pixelsub_sp_c.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


+void Lumasubstract_s_p(int bx, int by, short *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)<br>

+{<br>

+    for (int y = 0; y < by; y++)<br>

+    {<br>

+        for (int x = 0; x < bx; x++)<br>

+        {<br>

+            a[x] = (short)(b0[x] - b1[x]);<br>

+        }<br>

+<br>

+        b0 += sstride0;<br>

+        b1 += sstride1;<br>

+        a += dstride;<br>

+    }<br>

+}<br></blockquote><div><br></div><div>There's no need for a separate primitive for chroma; just make two calls to pixelsub_sp. There's no efficiency gained by doing two at once.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>

+                         pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1)<br>

+{<br>

+    for (int y = 0; y < by; y++)<br>

+    {<br>

+        for (int x = 0; x < bx; x++)<br>

+        {<br>

+            dstu[x] = (short)(u0[x] - u1[x]);<br>

+            dstv[x] = (short)(v0[x] - v1[x]);<br>

+        }<br>

+<br>

+        u0 += sstrideu0;<br>

+        u1 += sstrideu1;<br>

+        v0 += sstridev0;<br>

+        v1 += sstridev1;<br>

+        dstu += dstsrideu;<br>

+        dstv += dstsridev;<br>

+    }<br>

+}<br>

+<br>

 void blockcopy_p_s(int bx, int by, pixel *a, intptr_t stridea, short *b, intptr_t strideb)<br>

 {<br>

     for (int y = 0; y < by; y++)<br>

@@ -504,14 +538,15 @@<br>

 void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth)<br>

 {<br>

     int x, y;<br>

+<br>

     for (y = height - 1; y >= 0; y--)<br>

     {<br>

         for (x = width - 1; x >= 0; )<br>

         {<br>

             // note: luma min width is 4<br>

-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

             x--;<br>

-            dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

+            dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>

             x--;<br>

         }<br></blockquote><div><br></div><div>These are unrelated changes; they should be in a separate patch.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

<br>

@@ -519,7 +554,6 @@<br>

         dst  += dstStride;<br>

     }<br>

 }<br>

-<br>

 }  // end anonymous namespace<br>

<br>

 namespace x265 {<br>

@@ -619,6 +653,8 @@<br>

     p.blockcpy_ps = blockcopy_p_s;<br>

     p.blockcpy_sp = blockcopy_s_p;<br>

     p.blockcpy_sc = blockcopy_s_c;<br>

+    p.LumaSubstract_sp = Lumasubstract_s_p;<br>

+    p.ChromaSubstract_sp = Chromasubstract_s_p;<br>

<br>

     p.cvt16to32     = convert16to32;<br>

     p.cvt16to32_shl = convert16to32_shl;<br>

diff -r c9bb72e8cb8e -r be5257d512be source/common/primitives.h<br>

--- a/source/common/primitives.h        Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/common/primitives.h        Tue Jul 16 17:40:05 2013 +0530<br>

@@ -192,6 +192,9 @@<br>

 typedef void (*ipfilter_s2p_t)(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height);<br>

 typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned<br>

 typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned<br></blockquote><div><br></div><div>None of the other primitives have uppercase names.  Please follow conventions.  pixelsub_sp_t would be more appropriate.</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+typedef void (*LumaSubstract_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); // dst is aligned<br>

+typedef void (*ChromaSubstract_sp_t)(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>

+                                     pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1);<br>

 typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t dstride, short *src, intptr_t sstride); // dst is aligned<br>

 typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned<br>

 typedef void (*intra_dc_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int bFilter);<br>

@@ -236,6 +239,10 @@<br>

     blockcpy_ps_t   blockcpy_ps;                // block copy pixel from short<br>

     blockcpy_sp_t   blockcpy_sp;                // block copy short from pixel<br>

     blockcpy_sc_t   blockcpy_sc;                // block copy short from unsigned char<br>

+<br>

+    LumaSubstract_sp_t LumaSubstract_sp;<br>

+    ChromaSubstract_sp_t ChromaSubstract_sp;<br>

+<br>

     cvt16to32_t     cvt16to32;<br>

     cvt16to32_shl_t cvt16to32_shl;<br>

     cvt16to16_shl_t cvt16to16_shl;<br>

diff -r c9bb72e8cb8e -r be5257d512be source/common/vec/blockcopy.inc<br>

--- a/source/common/vec/blockcopy.inc   Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/common/vec/blockcopy.inc   Tue Jul 16 17:40:05 2013 +0530<br>

@@ -79,7 +79,7 @@<br>

         }<br>

     }<br>

     else<br></blockquote><div><br></div><div>More unrelated changes; these should be in a different patch.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


-#endif<br>

+#endif /* if INSTRSET >= 8 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -131,7 +131,7 @@<br>

         }<br>

     }<br>

     else<br>

-#endif<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -170,6 +170,7 @@<br>

 void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)<br>

 {<br>

     size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;<br>

+<br>

 #if INSTRSET >= 8 && 0<br>

     if (!(aligncheck & 31))<br>

     {<br>

@@ -189,7 +190,7 @@<br>

         }<br>

     }<br>

     else<br>

-#endif<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

     if (!(aligncheck & 15))<br>

     {<br>

         // fast path, multiples of 16 pixel wide blocks<br>

@@ -223,6 +224,173 @@<br>

     }<br>

 }<br></blockquote><div><br></div><div>use pixelsub_sp here</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+void Lumasubstract_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src0, uint8_t *src1, intptr_t sstride0, intptr_t sstride1)<br>

+{<br>

+    size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;<br>

+<br>

+#if INSTRSET >= 8 && 0<br>

+    if (!(aligncheck & 31))<br>

+    {<br>

+        // fast path, multiples of 32 pixel wide blocks<br>

+        // fast path, multiples of 16 pixel wide blocks<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 32)<br>

+            {<br>

+                Vec32uc word0, word1;<br>

+                Vec16s  word3, word4;<br>

+                word0.load_a(src0 + x);<br>

+                word1.load_a(src1 + x);<br>

+                word3 = extend_low(word0) - extend_low(word1);<br>

+                word4 = extend_high(word0) - extend_high(word1);<br>

+                word3.store_a(dst + x);<br>

+                word4.store_a(dst + x + 16);<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+    else<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

+    if (!(aligncheck & 15))<br>

+    {<br>

+        // fast path, multiples of 16 pixel wide blocks<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 16)<br>

+            {<br>

+                Vec16uc word0, word1;<br>

+                Vec8s word3, word4;<br>

+                word0.load_a(src0 + x);<br>

+                word1.load_a(src1 + x);<br>

+                word3 = extend_low(word0) - extend_low(word1);<br>

+                word4 = extend_high(word0) - extend_high(word1);<br>

+                word3.store_a(dst + x);<br>

+                word4.store_a(dst + x + 8);<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+    else<br>

+    {<br></blockquote><div><br></div><div>The slow path should still be vectorized if bx is large enough. It just needs to use unaligned loads and stores. Perhaps add another else if (bx >= 16) { vectorized and unaligned } clause.</div>

<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+        // slow path, irregular memory alignments or sizes<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x++)<br>

+            {<br>

+                dst[x] = (short)(src0[x] - src1[x]);<br>

+            }<br>

+<br>

+            src0 += sstride0;<br>

+            src1 += sstride1;<br>

+            dst += dstride;<br>

+        }<br>

+    }<br>

+}<br></blockquote><div><br></div><div>and drop the chroma function</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>

+                         pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1)<br>

+{<br>

+    size_t aligncheck = (size_t)dstu | (size_t)u0 | bx | sstrideu1 | dstsrideu;<br>

+<br>

+#if INSTRSET >= 8 && 0<br>

+    if (!(aligncheck & 31))<br>

+    {<br>

+        // fast path, multiples of 32 pixel wide blocks<br>

+        // fast path, multiples of 16 pixel wide blocks<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 32)<br>

+            {<br>

+                Vec32uc uword0, uword1;<br>

+                Vec16s  uword3, uword4;<br>

+                uword0.load_a(u0 + x);<br>

+                uword1.load_a(u1 + x);<br>

+                uword3 = extend_low(uword0) - extend_low(uword1);<br>

+                uword4 = extend_high(uword0) - extend_high(uword1);<br>

+                uword3.store_a(dstu + x);<br>

+                uword4.store_a(dstu + x + 16);<br>

+<br>

+                Vec32uc vword0, vword1;<br>

+                Vec16s  vword3, vword4;<br>

+                vword0.load_a(v0 + x);<br>

+                vword1.load_a(v1 + x);<br>

+                vword3 = extend_low(vword0) - extend_low(vword1);<br>

+                vword4 = extend_high(vword0) - extend_high(vword1);<br>

+                vword3.store_a(dstv + x);<br>

+                vword4.store_a(dstv + x + 16);<br>

+            }<br>

+<br>

+            u0 += sstrideu0;<br>

+            u1 += sstrideu1;<br>

+            v0 += sstridev0;<br>

+            v1 += sstridev1;<br>

+            dstu += dstsrideu;<br>

+            dstv += dstsridev;<br>

+        }<br>

+    }<br>

+    else<br>

+#endif /* if INSTRSET >= 8 && 0 */<br>

+    if (!(aligncheck & 15))<br>

+    {<br>

+        // fast path, multiples of 16 pixel wide blocks<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x += 16)<br>

+            {<br>

+                Vec16uc uword0, uword1;<br>

+                Vec8s uword3, uword4;<br>

+                uword0.load_a(u0 + x);<br>

+                uword1.load_a(u1 + x);<br>

+                uword3 = extend_low(uword0) - extend_low(uword1);<br>

+                uword4 = extend_high(uword0) - extend_high(uword1);<br>

+                uword3.store_a(dstu + x);<br>

+                uword4.store_a(dstu + x + 8);<br>

+<br>

+                Vec16uc vword0, vword1;<br>

+                Vec8s vword3, vword4;<br>

+                vword0.load_a(v0 + x);<br>

+                vword1.load_a(v1 + x);<br>

+                vword3 = extend_low(vword0) - extend_low(vword1);<br>

+                vword4 = extend_high(vword0) - extend_high(vword1);<br>

+                vword3.store_a(dstv + x);<br>

+                vword4.store_a(dstv + x + 8);<br>

+            }<br>

+<br>

+            u0 += sstrideu0;<br>

+            u1 += sstrideu1;<br>

+            v0 += sstridev0;<br>

+            v1 += sstridev1;<br>

+            dstu += dstsrideu;<br>

+            dstv += dstsridev;<br>

+        }<br>

+    }<br>

+    else<br>

+    {<br>

+        // slow path, irregular memory alignments or sizes<br>

+        for (int y = 0; y < by; y++)<br>

+        {<br>

+            for (int x = 0; x < bx; x++)<br>

+            {<br>

+                dstu[x] = (short)(u0[x] - u1[x]);<br>

+                dstv[x] = (short)(v0[x] - v1[x]);<br>

+            }<br>

+<br>

+            u0 += sstrideu0;<br>

+            u1 += sstrideu1;<br>

+            v0 += sstridev0;<br>

+            v1 += sstridev1;<br>

+            dstu += dstsrideu;<br>

+            dstv += dstsridev;<br>

+        }<br>

+    }<br>

+}<br>

+<br>

 void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)<br>

 {<br>

 #if HIGH_BIT_DEPTH<br>

@@ -231,10 +399,13 @@<br>

     p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;<br>

     p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;<br>

     p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;<br>

+    p.blockcpyyuv_sp = (x265::blockcpy_sc_t)blockcopyYuv_s_p;<br>

 #else<br>

     p.blockcpy_pp = blockcopy_p_p;<br>

     p.blockcpy_ps = blockcopy_p_s;<br>

     p.blockcpy_sp = blockcopy_s_p;<br>

     p.blockcpy_sc = blockcopy_s_p;<br>

-#endif<br>

+    p.LumaSubstract_sp = Lumasubstract_s_p;<br>

+    p.ChromaSubstract_sp = Chromasubstract_s_p;<br>

+#endif /* if HIGH_BIT_DEPTH */<br>

 }<br>

diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.cpp<br>

--- a/source/test/pixelharness.cpp      Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/test/pixelharness.cpp      Tue Jul 16 17:40:05 2013 +0530<br>

@@ -56,10 +56,13 @@<br>

     pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>

     pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br></blockquote><div><br></div><div>pbuf3 and pbuf4 are unnecessary once you drop the chroma function.</div><div> <br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


+    pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>

+    pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>

+<br>

     sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);<br>

     sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);<br>

<br>

-    if (!pbuf1 || !pbuf2)<br>

+    if (!pbuf1 || !pbuf2 | !pbuf3 | !pbuf4)<br>

     {<br>

         fprintf(stderr, "malloc failed, unable to initiate tests!\n");<br>

         exit(1);<br>

@@ -71,6 +74,9 @@<br>

         pbuf1[i] = rand() & PIXEL_MAX;<br>

         pbuf2[i] = rand() & PIXEL_MAX;<br>

<br>

+        pbuf3[i] = rand() & PIXEL_MAX;<br>

+        pbuf4[i] = rand() & PIXEL_MAX;<br>

+<br>

         sbuf1[i] = rand() & PIXEL_MAX;<br>

         sbuf2[i] = rand() & PIXEL_MAX;<br>

     }<br>

@@ -222,6 +228,59 @@<br>

     return true;<br>

 }<br>

<br>

+bool PixelHarness::check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt)<br>

+{<br>

+    ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>

+    ALIGN_VAR_16(short, opt_dest[64 * 64]);<br>

+    int bx = 64;<br>

+    int by = 64;<br>

+    int j = 0;<br>

+    for (int i = 0; i <= 100; i++)<br>

+    {<br>

+        opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);<br>

+        ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);<br>

+<br>

+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))<br>

+            return false;<br>

+<br>

+        j += 4;<br>

+        bx = 4 * ((rand() & 15) + 1);<br>

+        by = 4 * ((rand() & 15) + 1);<br>

+    }<br>

+<br>

+    return true;<br>

+}<br>

+<br>

+bool PixelHarness::check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt)<br>

+{<br>

+    ALIGN_VAR_16(short, ref_destu[64 * 64]);<br>

+    ALIGN_VAR_16(short, opt_destu[64 * 64]);<br>

+<br>

+    ALIGN_VAR_16(short, ref_destv[64 * 64]);<br>

+    ALIGN_VAR_16(short, opt_destv[64 * 64]);<br>

+<br>

+    int bx = 64;<br>

+    int by = 64;<br>

+    int j = 0;<br>

+    for (int i = 0; i <= 100; i++)<br>

+    {<br>

+        opt(bx, by, opt_destu, 64, opt_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);<br>

+        ref(bx, by, ref_destu, 64, ref_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);<br>

+<br>

+        if (memcmp(ref_destu, opt_destu, 64 * 64 * sizeof(short)))<br>

+            return false;<br>

+<br>

+        if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(short)))<br>

+            return false;<br>

+<br>

+        j += 4;<br>

+        bx = 4 * ((rand() & 15) + 1);<br>

+        by = 4 * ((rand() & 15) + 1);<br>

+    }<br>

+<br>

+    return true;<br>

+}<br>

+<br>

 bool PixelHarness::check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt)<br>

 {<br>

     ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>

@@ -341,14 +400,15 @@<br>

     int offset = (rand() % 256) - 128;<br>

     for (int i = 0; i <= 100; i++)<br>

     {<br>

-        opt(sbuf1+j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>

-        ref(sbuf1+j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>

+        opt(sbuf1 + j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>

+        ref(sbuf1 + j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<br>

         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))<br>

             return false;<br>

<br>

         j += 4;<br>

     }<br>

+<br>

     return true;<br>

 }<br>

<br>

@@ -484,6 +544,24 @@<br>

         }<br>

     }<br>

<br>

+    if (opt.LumaSubstract_sp)<br>

+    {<br>

+        if (!check_LumaSubstract_s_p(ref.LumaSubstract_sp, opt.LumaSubstract_sp))<br>

+        {<br>

+            printf("Luma Substract failed!\n");<br>

+            return false;<br>

+        }<br>

+    }<br>

+<br>

+    if (opt.ChromaSubstract_sp)<br>

+    {<br>

+        if (!check_ChromaSubstract_s_p(ref.ChromaSubstract_sp, opt.ChromaSubstract_sp))<br>

+        {<br>

+            printf("Chroma Substract failed!\n");<br>

+            return false;<br>

+        }<br>

+    }<br>

+<br>

     if (opt.blockcpy_sc)<br>

     {<br>

         if (!check_block_copy_s_c(ref.blockcpy_sc, opt.blockcpy_sc))<br>

@@ -492,7 +570,7 @@<br>

             return false;<br>

         }<br>

     }<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

-<br>

+<br>

     if (opt.weightpUni)<br>

     {<br>

         if (!check_weightpUni(ref.weightpUni, opt.weightpUni))<br>

@@ -502,7 +580,6 @@<br>

         }<br>

     }<br>

<br>

-<br>

     return true;<br>

 }<br>

<br>

@@ -600,6 +677,18 @@<br>

         REPORT_SPEEDUP(opt.blockcpy_sp, ref.blockcpy_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, STRIDE);<br>

     }<br>

<br>

+    if (opt.LumaSubstract_sp)<br>

+    {<br>

+        printf("Luma Sub");<br>

+        REPORT_SPEEDUP(opt.LumaSubstract_sp, ref.LumaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);<br>

+    }<br>

+<br>

+    if (opt.ChromaSubstract_sp)<br>

+    {<br>

+        printf("Chroma Sub");<br>

+        REPORT_SPEEDUP(opt.ChromaSubstract_sp, ref.ChromaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, FENC_STRIDE,  pbuf3, pbuf3, STRIDE, STRIDE, pbuf4, pbuf4, STRIDE, STRIDE);<br>

+    }<br>

+<br>

     if (opt.blockcpy_sc)<br>

     {<br>

         printf("s_c   cpy");<br>

@@ -609,6 +698,6 @@<br>

     if (opt.weightpUni)<br>

     {<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

         printf("WeightpUni");<br>

-        REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1<<9, 10, 100, BIT_DEPTH);<br>

+        REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100, BIT_DEPTH);<br>

     }<br>

 }<br>

diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.h<br>

--- a/source/test/pixelharness.h        Mon Jul 15 23:41:11 2013 -0500<br>

+++ b/source/test/pixelharness.h        Tue Jul 16 17:40:05 2013 +0530<br>

@@ -31,7 +31,7 @@<br>

 {<br>

 protected:<br>

<br>

-    pixel *pbuf1, *pbuf2;<br>

+    pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;<br>

<br>

     short *sbuf1, *sbuf2;<br>

<br>

@@ -42,6 +42,8 @@<br>

     bool check_pixelcmp_x4(x265::pixelcmp_x4_t ref, x265::pixelcmp_x4_t opt);<br>

     bool check_block_copy(x265::blockcpy_pp_t ref, x265::blockcpy_pp_t opt);<br>

     bool check_block_copy_s_p(x265::blockcpy_sp_t ref, x265::blockcpy_sp_t opt);<br></blockquote><div><br></div><div>Tabs? Really? Please indent with spaces.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


+       bool check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt);<br>

+       bool check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt);<br>

     bool check_block_copy_p_s(x265::blockcpy_ps_t ref, x265::blockcpy_ps_t opt);<br>

     bool check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt);<br>

     bool check_calresidual(x265::calcresidual_t ref, x265::calcresidual_t opt);<br></blockquote></div><div><br></div>-- <br>Steve Borho

</div></div>