<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Tue, Jul 16, 2013 at 7:10 AM, <span dir="ltr"><<a href="mailto:gopu@multicorewareinc.com" target="_blank">gopu@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User ggopu<br>
# Date 1373976605 -19800<br>
# Node ID be5257d512becc658a3c0f1d7a0a7defd0c911af<br>
# Parent c9bb72e8cb8effc0d1d0e99f0b9abc8d341c652a<br>
TShortYuv : Performance Primitives for Luma and Chroma Subtracting<br>
<br>
diff -r c9bb72e8cb8e -r be5257d512be source/common/TShortYUV.cpp<br>
--- a/source/common/TShortYUV.cpp Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/common/TShortYUV.cpp Tue Jul 16 17:40:05 2013 +0530<br>
@@ -30,6 +30,8 @@<br>
#include "TShortYUV.h"<br>
#include "TLibCommon/TComYuv.h"<br>
<br>
+using namespace x265;<br>
+<br>
TShortYUV::TShortYUV()<br>
{<br>
YBuf = NULL;<br>
@@ -76,61 +78,37 @@<br>
subtractChroma(pcYuvSrc0, pcYuvSrc1, uiTrUnitIdx, uiPartSize >> 1);<br>
}<br>
<br>
-void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>
+void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br>
{<br>
- int x, y;<br>
+ int x = partSize, y = partSize;<br>
<br>
- Pel* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>
- Pel* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);<br>
- Short* pDst = getLumaAddr(uiTrUnitIdx, uiPartSize);<br>
+ Pel* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);<br>
+ Pel* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);<br>
+ Short* dst = getLumaAddr(trUnitIdx, partSize);<br>
<br>
- int iSrc0Stride = pcYuvSrc0->getStride();<br>
- int iSrc1Stride = pcYuvSrc1->getStride();<br>
- int iDstStride = width;<br>
+ int src0Stride = pcYuvSrc0->getStride();<br>
+ int src1Stride = pcYuvSrc1->getStride();<br>
+ int dstStride = width;<br>
<br>
- for (y = uiPartSize - 1; y >= 0; y--)<br>
- {<br>
- for (x = uiPartSize - 1; x >= 0; x--)<br>
- {<br>
- pDst[x] = static_cast<short>(pSrc0[x]) - static_cast<short>(pSrc1[x]);<br>
- }<br>
-<br>
- pSrc0 += iSrc0Stride;<br>
- pSrc1 += iSrc1Stride;<br>
- pDst += iDstStride;<br>
- }<br>
+ primitives.LumaSubstract_sp(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);<br>
}<br>
<br>
-void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>
+void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)<br>
{<br>
- int x, y;<br>
+ int x = partSize, y = partSize;<br>
<br>
- Pel* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);<br>
- Pel* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);<br>
- Pel* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);<br>
- Pel* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);<br>
- Short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);<br>
- Short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);<br>
+ Pel* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);<br>
+ Pel* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);<br>
+ Pel* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);<br>
+ Pel* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);<br>
+ Short* dstU = getCbAddr(trUnitIdx, partSize);<br>
+ Short* dstV = getCrAddr(trUnitIdx, partSize);<br>
<br>
- int iSrc0Stride = pcYuvSrc0->getCStride();<br>
- int iSrc1Stride = pcYuvSrc1->getCStride();<br>
- int iDstStride = Cwidth;<br>
+ int src0Stride = pcYuvSrc0->getCStride();<br>
+ int src1Stride = pcYuvSrc1->getCStride();<br>
+ int dstStride = Cwidth;<br>
<br>
- for (y = uiPartSize - 1; y >= 0; y--)<br>
- {<br>
- for (x = uiPartSize - 1; x >= 0; x--)<br>
- {<br>
- pDstU[x] = static_cast<short>(pSrcU0[x]) - static_cast<short>(pSrcU1[x]);<br>
- pDstV[x] = static_cast<short>(pSrcV0[x]) - static_cast<short>(pSrcV1[x]);<br>
- }<br>
-<br>
- pSrcU0 += iSrc0Stride;<br>
- pSrcU1 += iSrc1Stride;<br>
- pSrcV0 += iSrc0Stride;<br>
- pSrcV1 += iSrc1Stride;<br>
- pDstU += iDstStride;<br>
- pDstV += iDstStride;<br>
- }<br>
+ primitives.ChromaSubstract_sp(x, y, dstU, dstStride, dstV, dstStride, srcU0, srcU1, src0Stride, src1Stride, srcV0, srcV1, src0Stride, src1Stride);<br>
}<br></blockquote><div> </div><div>This part looks fine.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
void TShortYUV::addClip(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)<br>
diff -r c9bb72e8cb8e -r be5257d512be source/common/pixel.cpp<br>
--- a/source/common/pixel.cpp Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/common/pixel.cpp Tue Jul 16 17:40:05 2013 +0530<br>
@@ -30,7 +30,6 @@<br>
#include "TLibCommon/CommonDef.h"<br>
#include "TLibCommon/TComPrediction.h"<br>
<br>
-<br>
#define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \<br>
p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4, DATA_TYPE1, DATA_TYPE2>; \<br>
p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8, DATA_TYPE1, DATA_TYPE2>; \<br>
@@ -388,6 +387,41 @@<br>
}<br>
}<br></blockquote><div><br></div><div>Why use a capital L? This should just be lumasubtract_sp or, even better, pixelsub_sp_c.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+void Lumasubstract_s_p(int bx, int by, short *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)<br>
+{<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x++)<br>
+ {<br>
+ a[x] = (short)(b0[x] - b1[x]);<br>
+ }<br>
+<br>
+ b0 += sstride0;<br>
+ b1 += sstride1;<br>
+ a += dstride;<br>
+ }<br>
+}<br></blockquote><div><br></div><div>There's no need for a separate primitive for chroma; just make two calls to pixelsub_sp, one for each chroma plane. There's no efficiency gained by processing both planes at once.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>
+ pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1)<br>
+{<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x++)<br>
+ {<br>
+ dstu[x] = (short)(u0[x] - u1[x]);<br>
+ dstv[x] = (short)(v0[x] - v1[x]);<br>
+ }<br>
+<br>
+ u0 += sstrideu0;<br>
+ u1 += sstrideu1;<br>
+ v0 += sstridev0;<br>
+ v1 += sstridev1;<br>
+ dstu += dstsrideu;<br>
+ dstv += dstsridev;<br>
+ }<br>
+}<br>
+<br>
void blockcopy_p_s(int bx, int by, pixel *a, intptr_t stridea, short *b, intptr_t strideb)<br>
{<br>
for (int y = 0; y < by; y++)<br>
@@ -504,14 +538,15 @@<br>
void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth)<br>
{<br>
int x, y;<br>
+<br>
for (y = height - 1; y >= 0; y--)<br>
{<br>
for (x = width - 1; x >= 0; )<br>
{<br>
// note: luma min width is 4<br>
- dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>
+ dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>
x--;<br>
- dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>
+ dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);<br>
x--;<br>
}<br></blockquote><div><br></div><div>Unrelated changes, should be in a separate patch</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
@@ -519,7 +554,6 @@<br>
dst += dstStride;<br>
}<br>
}<br>
-<br>
} // end anonymous namespace<br>
<br>
namespace x265 {<br>
@@ -619,6 +653,8 @@<br>
p.blockcpy_ps = blockcopy_p_s;<br>
p.blockcpy_sp = blockcopy_s_p;<br>
p.blockcpy_sc = blockcopy_s_c;<br>
+ p.LumaSubstract_sp = Lumasubstract_s_p;<br>
+ p.ChromaSubstract_sp = Chromasubstract_s_p;<br>
<br>
p.cvt16to32 = convert16to32;<br>
p.cvt16to32_shl = convert16to32_shl;<br>
diff -r c9bb72e8cb8e -r be5257d512be source/common/primitives.h<br>
--- a/source/common/primitives.h Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/common/primitives.h Tue Jul 16 17:40:05 2013 +0530<br>
@@ -192,6 +192,9 @@<br>
typedef void (*ipfilter_s2p_t)(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height);<br>
typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned<br>
typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned<br></blockquote><div><br></div><div>None of the other primitives have uppercase names. Please follow conventions. pixelsub_sp_t would be more appropriate.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+typedef void (*LumaSubstract_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); // dst is aligned<br>
+typedef void (*ChromaSubstract_sp_t)(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>
+ pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1);<br>
typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t dstride, short *src, intptr_t sstride); // dst is aligned<br>
typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned<br>
typedef void (*intra_dc_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int bFilter);<br>
@@ -236,6 +239,10 @@<br>
blockcpy_ps_t blockcpy_ps; // block copy pixel from short<br>
blockcpy_sp_t blockcpy_sp; // block copy short from pixel<br>
blockcpy_sc_t blockcpy_sc; // block copy short from unsigned char<br>
+<br>
+ LumaSubstract_sp_t LumaSubstract_sp;<br>
+ ChromaSubstract_sp_t ChromaSubstract_sp;<br>
+<br>
cvt16to32_t cvt16to32;<br>
cvt16to32_shl_t cvt16to32_shl;<br>
cvt16to16_shl_t cvt16to16_shl;<br>
diff -r c9bb72e8cb8e -r be5257d512be source/common/vec/blockcopy.inc<br>
--- a/source/common/vec/blockcopy.inc Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/common/vec/blockcopy.inc Tue Jul 16 17:40:05 2013 +0530<br>
@@ -79,7 +79,7 @@<br>
}<br>
}<br>
else<br></blockquote><div><br></div><div>More unrelated changes, should be in a different patch</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
-#endif<br>
+#endif /* if INSTRSET >= 8 */<br>
if (!(aligncheck & 15))<br>
{<br>
// fast path, multiples of 16 pixel wide blocks<br>
@@ -131,7 +131,7 @@<br>
}<br>
}<br>
else<br>
-#endif<br>
+#endif /* if INSTRSET >= 8 && 0 */<br>
if (!(aligncheck & 15))<br>
{<br>
// fast path, multiples of 16 pixel wide blocks<br>
@@ -170,6 +170,7 @@<br>
void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)<br>
{<br>
size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;<br>
+<br>
#if INSTRSET >= 8 && 0<br>
if (!(aligncheck & 31))<br>
{<br>
@@ -189,7 +190,7 @@<br>
}<br>
}<br>
else<br>
-#endif<br>
+#endif /* if INSTRSET >= 8 && 0 */<br>
if (!(aligncheck & 15))<br>
{<br>
// fast path, multiples of 16 pixel wide blocks<br>
@@ -223,6 +224,173 @@<br>
}<br>
}<br></blockquote><div><br></div><div>Use the name pixelsub_sp here as well.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+void Lumasubstract_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src0, uint8_t *src1, intptr_t sstride0, intptr_t sstride1)<br>
+{<br>
+ size_t aligncheck = (size_t)dst | (size_t)src0 | bx | sstride0 | dstride;<br>
+<br>
+#if INSTRSET >= 8 && 0<br>
+ if (!(aligncheck & 31))<br>
+ {<br>
+ // fast path, multiples of 32 pixel wide blocks<br>
+ // fast path, multiples of 16 pixel wide blocks<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x += 32)<br>
+ {<br>
+ Vec32uc word0, word1;<br>
+ Vec16s word3, word4;<br>
+ word0.load_a(src0 + x);<br>
+ word1.load_a(src1 + x);<br>
+ word3 = extend_low(word0) - extend_low(word1);<br>
+ word4 = extend_high(word0) - extend_high(word1);<br>
+ word3.store_a(dst + x);<br>
+ word4.store_a(dst + x + 16);<br>
+ }<br>
+<br>
+ src0 += sstride0;<br>
+ src1 += sstride1;<br>
+ dst += dstride;<br>
+ }<br>
+ }<br>
+ else<br>
+#endif /* if INSTRSET >= 8 && 0 */<br>
+ if (!(aligncheck & 15))<br>
+ {<br>
+ // fast path, multiples of 16 pixel wide blocks<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x += 16)<br>
+ {<br>
+ Vec16uc word0, word1;<br>
+ Vec8s word3, word4;<br>
+ word0.load_a(src0 + x);<br>
+ word1.load_a(src1 + x);<br>
+ word3 = extend_low(word0) - extend_low(word1);<br>
+ word4 = extend_high(word0) - extend_high(word1);<br>
+ word3.store_a(dst + x);<br>
+ word4.store_a(dst + x + 8);<br>
+ }<br>
+<br>
+ src0 += sstride0;<br>
+ src1 += sstride1;<br>
+ dst += dstride;<br>
+ }<br>
+ }<br>
+ else<br>
+ {<br></blockquote><div><br></div><div>The slow path should still be vectorized if bx is large enough. It just needs to use unaligned loads and stores. Perhaps add another else if (bx >= 16) { vectorized and unaligned } clause.</div>
<div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ // slow path, irregular memory alignments or sizes<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x++)<br>
+ {<br>
+ dst[x] = (short)(src0[x] - src1[x]);<br>
+ }<br>
+<br>
+ src0 += sstride0;<br>
+ src1 += sstride1;<br>
+ dst += dstride;<br>
+ }<br>
+ }<br>
+}<br></blockquote><div><br></div><div>and drop the chroma function</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,<br>
+ pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1)<br>
+{<br>
+ size_t aligncheck = (size_t)dstu | (size_t)u0 | bx | sstrideu1 | dstsrideu;<br>
+<br>
+#if INSTRSET >= 8 && 0<br>
+ if (!(aligncheck & 31))<br>
+ {<br>
+ // fast path, multiples of 32 pixel wide blocks<br>
+ // fast path, multiples of 16 pixel wide blocks<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x += 32)<br>
+ {<br>
+ Vec32uc uword0, uword1;<br>
+ Vec16s uword3, uword4;<br>
+ uword0.load_a(u0 + x);<br>
+ uword1.load_a(u1 + x);<br>
+ uword3 = extend_low(uword0) - extend_low(uword1);<br>
+ uword4 = extend_high(uword0) - extend_high(uword1);<br>
+ uword3.store_a(dstu + x);<br>
+ uword4.store_a(dstu + x + 16);<br>
+<br>
+ Vec32uc vword0, vword1;<br>
+ Vec16s vword3, vword4;<br>
+ vword0.load_a(v0 + x);<br>
+ vword1.load_a(v1 + x);<br>
+ vword3 = extend_low(vword0) - extend_low(vword1);<br>
+ vword4 = extend_high(vword0) - extend_high(vword1);<br>
+ vword3.store_a(dstv + x);<br>
+ vword4.store_a(dstv + x + 16);<br>
+ }<br>
+<br>
+ u0 += sstrideu0;<br>
+ u1 += sstrideu1;<br>
+ v0 += sstridev0;<br>
+ v1 += sstridev1;<br>
+ dstu += dstsrideu;<br>
+ dstv += dstsridev;<br>
+ }<br>
+ }<br>
+ else<br>
+#endif /* if INSTRSET >= 8 && 0 */<br>
+ if (!(aligncheck & 15))<br>
+ {<br>
+ // fast path, multiples of 16 pixel wide blocks<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x += 16)<br>
+ {<br>
+ Vec16uc uword0, uword1;<br>
+ Vec8s uword3, uword4;<br>
+ uword0.load_a(u0 + x);<br>
+ uword1.load_a(u1 + x);<br>
+ uword3 = extend_low(uword0) - extend_low(uword1);<br>
+ uword4 = extend_high(uword0) - extend_high(uword1);<br>
+ uword3.store_a(dstu + x);<br>
+ uword4.store_a(dstu + x + 8);<br>
+<br>
+ Vec16uc vword0, vword1;<br>
+ Vec8s vword3, vword4;<br>
+ vword0.load_a(v0 + x);<br>
+ vword1.load_a(v1 + x);<br>
+ vword3 = extend_low(vword0) - extend_low(vword1);<br>
+ vword4 = extend_high(vword0) - extend_high(vword1);<br>
+ vword3.store_a(dstv + x);<br>
+ vword4.store_a(dstv + x + 8);<br>
+ }<br>
+<br>
+ u0 += sstrideu0;<br>
+ u1 += sstrideu1;<br>
+ v0 += sstridev0;<br>
+ v1 += sstridev1;<br>
+ dstu += dstsrideu;<br>
+ dstv += dstsridev;<br>
+ }<br>
+ }<br>
+ else<br>
+ {<br>
+ // slow path, irregular memory alignments or sizes<br>
+ for (int y = 0; y < by; y++)<br>
+ {<br>
+ for (int x = 0; x < bx; x++)<br>
+ {<br>
+ dstu[x] = (short)(u0[x] - u1[x]);<br>
+ dstv[x] = (short)(v0[x] - v1[x]);<br>
+ }<br>
+<br>
+ u0 += sstrideu0;<br>
+ u1 += sstrideu1;<br>
+ v0 += sstridev0;<br>
+ v1 += sstridev1;<br>
+ dstu += dstsrideu;<br>
+ dstv += dstsridev;<br>
+ }<br>
+ }<br>
+}<br>
+<br>
void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)<br>
{<br>
#if HIGH_BIT_DEPTH<br>
@@ -231,10 +399,13 @@<br>
p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;<br>
p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;<br>
p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;<br>
+ p.blockcpyyuv_sp = (x265::blockcpy_sc_t)blockcopyYuv_s_p;<br>
#else<br>
p.blockcpy_pp = blockcopy_p_p;<br>
p.blockcpy_ps = blockcopy_p_s;<br>
p.blockcpy_sp = blockcopy_s_p;<br>
p.blockcpy_sc = blockcopy_s_p;<br>
-#endif<br>
+ p.LumaSubstract_sp = Lumasubstract_s_p;<br>
+ p.ChromaSubstract_sp = Chromasubstract_s_p;<br>
+#endif /* if HIGH_BIT_DEPTH */<br>
}<br>
diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.cpp<br>
--- a/source/test/pixelharness.cpp Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/test/pixelharness.cpp Tue Jul 16 17:40:05 2013 +0530<br>
@@ -56,10 +56,13 @@<br>
pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>
pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br></blockquote><div><br></div><div>pbuf3 and pbuf4 are unnecessary once you drop the chroma function</div><div> <br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>
+ pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);<br>
+<br>
sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);<br>
sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);<br>
<br>
- if (!pbuf1 || !pbuf2)<br>
+ if (!pbuf1 || !pbuf2 | !pbuf3 | !pbuf4)<br>
{<br>
fprintf(stderr, "malloc failed, unable to initiate tests!\n");<br>
exit(1);<br>
@@ -71,6 +74,9 @@<br>
pbuf1[i] = rand() & PIXEL_MAX;<br>
pbuf2[i] = rand() & PIXEL_MAX;<br>
<br>
+ pbuf3[i] = rand() & PIXEL_MAX;<br>
+ pbuf4[i] = rand() & PIXEL_MAX;<br>
+<br>
sbuf1[i] = rand() & PIXEL_MAX;<br>
sbuf2[i] = rand() & PIXEL_MAX;<br>
}<br>
@@ -222,6 +228,59 @@<br>
return true;<br>
}<br>
<br>
+bool PixelHarness::check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt)<br>
+{<br>
+ ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>
+ ALIGN_VAR_16(short, opt_dest[64 * 64]);<br>
+ int bx = 64;<br>
+ int by = 64;<br>
+ int j = 0;<br>
+ for (int i = 0; i <= 100; i++)<br>
+ {<br>
+ opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);<br>
+ ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);<br>
+<br>
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))<br>
+ return false;<br>
+<br>
+ j += 4;<br>
+ bx = 4 * ((rand() & 15) + 1);<br>
+ by = 4 * ((rand() & 15) + 1);<br>
+ }<br>
+<br>
+ return true;<br>
+}<br>
+<br>
+bool PixelHarness::check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt)<br>
+{<br>
+ ALIGN_VAR_16(short, ref_destu[64 * 64]);<br>
+ ALIGN_VAR_16(short, opt_destu[64 * 64]);<br>
+<br>
+ ALIGN_VAR_16(short, ref_destv[64 * 64]);<br>
+ ALIGN_VAR_16(short, opt_destv[64 * 64]);<br>
+<br>
+ int bx = 64;<br>
+ int by = 64;<br>
+ int j = 0;<br>
+ for (int i = 0; i <= 100; i++)<br>
+ {<br>
+ opt(bx, by, opt_destu, 64, opt_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);<br>
+ ref(bx, by, ref_destu, 64, ref_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);<br>
+<br>
+ if (memcmp(ref_destu, opt_destu, 64 * 64 * sizeof(short)))<br>
+ return false;<br>
+<br>
+ if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(short)))<br>
+ return false;<br>
+<br>
+ j += 4;<br>
+ bx = 4 * ((rand() & 15) + 1);<br>
+ by = 4 * ((rand() & 15) + 1);<br>
+ }<br>
+<br>
+ return true;<br>
+}<br>
+<br>
bool PixelHarness::check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt)<br>
{<br>
ALIGN_VAR_16(short, ref_dest[64 * 64]);<br>
@@ -341,14 +400,15 @@<br>
int offset = (rand() % 256) - 128;<br>
for (int i = 0; i <= 100; i++)<br>
{<br>
- opt(sbuf1+j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>
- ref(sbuf1+j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>
+ opt(sbuf1 + j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br>
+ ref(sbuf1 + j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))<br>
return false;<br>
<br>
j += 4;<br>
}<br>
+<br>
return true;<br>
}<br>
<br>
@@ -484,6 +544,24 @@<br>
}<br>
}<br>
<br>
+ if (opt.LumaSubstract_sp)<br>
+ {<br>
+ if (!check_LumaSubstract_s_p(ref.LumaSubstract_sp, opt.LumaSubstract_sp))<br>
+ {<br>
+ printf("Luma Substract failed!\n");<br>
+ return false;<br>
+ }<br>
+ }<br>
+<br>
+ if (opt.ChromaSubstract_sp)<br>
+ {<br>
+ if (!check_ChromaSubstract_s_p(ref.ChromaSubstract_sp, opt.ChromaSubstract_sp))<br>
+ {<br>
+ printf("Chroma Substract failed!\n");<br>
+ return false;<br>
+ }<br>
+ }<br>
+<br>
if (opt.blockcpy_sc)<br>
{<br>
if (!check_block_copy_s_c(ref.blockcpy_sc, opt.blockcpy_sc))<br>
@@ -492,7 +570,7 @@<br>
return false;<br>
}<br>
}<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
-<br>
+<br>
if (opt.weightpUni)<br>
{<br>
if (!check_weightpUni(ref.weightpUni, opt.weightpUni))<br>
@@ -502,7 +580,6 @@<br>
}<br>
}<br>
<br>
-<br>
return true;<br>
}<br>
<br>
@@ -600,6 +677,18 @@<br>
REPORT_SPEEDUP(opt.blockcpy_sp, ref.blockcpy_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, STRIDE);<br>
}<br>
<br>
+ if (opt.LumaSubstract_sp)<br>
+ {<br>
+ printf("Luma Sub");<br>
+ REPORT_SPEEDUP(opt.LumaSubstract_sp, ref.LumaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);<br>
+ }<br>
+<br>
+ if (opt.ChromaSubstract_sp)<br>
+ {<br>
+ printf("Chroma Sub");<br>
+ REPORT_SPEEDUP(opt.ChromaSubstract_sp, ref.ChromaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, FENC_STRIDE, pbuf3, pbuf3, STRIDE, STRIDE, pbuf4, pbuf4, STRIDE, STRIDE);<br>
+ }<br>
+<br>
if (opt.blockcpy_sc)<br>
{<br>
printf("s_c cpy");<br>
@@ -609,6 +698,6 @@<br>
if (opt.weightpUni)<br>
{<br></blockquote><div><br></div><div>unrelated changes</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
printf("WeightpUni");<br>
- REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1<<9, 10, 100, BIT_DEPTH);<br>
+ REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100, BIT_DEPTH);<br>
}<br>
}<br>
diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.h<br>
--- a/source/test/pixelharness.h Mon Jul 15 23:41:11 2013 -0500<br>
+++ b/source/test/pixelharness.h Tue Jul 16 17:40:05 2013 +0530<br>
@@ -31,7 +31,7 @@<br>
{<br>
protected:<br>
<br>
- pixel *pbuf1, *pbuf2;<br>
+ pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;<br>
<br>
short *sbuf1, *sbuf2;<br>
<br>
@@ -42,6 +42,8 @@<br>
bool check_pixelcmp_x4(x265::pixelcmp_x4_t ref, x265::pixelcmp_x4_t opt);<br>
bool check_block_copy(x265::blockcpy_pp_t ref, x265::blockcpy_pp_t opt);<br>
bool check_block_copy_s_p(x265::blockcpy_sp_t ref, x265::blockcpy_sp_t opt);<br></blockquote><div><br></div><div>tabs? really?</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ bool check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt);<br>
+ bool check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt);<br>
bool check_block_copy_p_s(x265::blockcpy_ps_t ref, x265::blockcpy_ps_t opt);<br>
bool check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt);<br>
bool check_calresidual(x265::calcresidual_t ref, x265::calcresidual_t opt);<br></blockquote></div><div><br></div>-- <br>Steve Borho
</div></div>