[x265] [PATCH] TShortYuv : Performance Primitives for Luma and Chroma Subtracting
gopu at multicorewareinc.com
gopu at multicorewareinc.com
Tue Jul 16 14:10:18 CEST 2013
# HG changeset patch
# User ggopu
# Date 1373976605 -19800
# Node ID be5257d512becc658a3c0f1d7a0a7defd0c911af
# Parent c9bb72e8cb8effc0d1d0e99f0b9abc8d341c652a
TShortYuv : Performance Primitives for Luma and Chroma Subtracting
diff -r c9bb72e8cb8e -r be5257d512be source/common/TShortYUV.cpp
--- a/source/common/TShortYUV.cpp Mon Jul 15 23:41:11 2013 -0500
+++ b/source/common/TShortYUV.cpp Tue Jul 16 17:40:05 2013 +0530
@@ -30,6 +30,8 @@
#include "TShortYUV.h"
#include "TLibCommon/TComYuv.h"
+using namespace x265;
+
TShortYUV::TShortYUV()
{
YBuf = NULL;
@@ -76,61 +78,37 @@
subtractChroma(pcYuvSrc0, pcYuvSrc1, uiTrUnitIdx, uiPartSize >> 1);
}
-void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize) // luma residual of one partSize x partSize unit: this = pcYuvSrc0 - pcYuvSrc1
{
- int x, y;
+ int x = partSize, y = partSize; // block width and height handed to the primitive
- Pel* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);
- Short* pDst = getLumaAddr(uiTrUnitIdx, uiPartSize);
+ Pel* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);
+ Pel* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);
+ Short* dst = getLumaAddr(trUnitIdx, partSize);
- int iSrc0Stride = pcYuvSrc0->getStride();
- int iSrc1Stride = pcYuvSrc1->getStride();
- int iDstStride = width;
+ int src0Stride = pcYuvSrc0->getStride();
+ int src1Stride = pcYuvSrc1->getStride();
+ int dstStride = width; // destination rows are 'width' shorts apart
- for (y = uiPartSize - 1; y >= 0; y--)
- {
- for (x = uiPartSize - 1; x >= 0; x--)
- {
- pDst[x] = static_cast<short>(pSrc0[x]) - static_cast<short>(pSrc1[x]);
- }
-
- pSrc0 += iSrc0Stride;
- pSrc1 += iSrc1Stride;
- pDst += iDstStride;
- }
+ primitives.LumaSubstract_sp(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride); // same per-sample src0 - src1 as the loop removed above, now called through the primitives table
}
-void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize) // Cb and Cr residual of one partSize x partSize unit: this = pcYuvSrc0 - pcYuvSrc1
{
- int x, y;
+ int x = partSize, y = partSize; // block width and height handed to the primitive
- Pel* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);
- Short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);
- Short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);
+ Pel* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);
+ Pel* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);
+ Pel* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);
+ Pel* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);
+ Short* dstU = getCbAddr(trUnitIdx, partSize);
+ Short* dstV = getCrAddr(trUnitIdx, partSize);
- int iSrc0Stride = pcYuvSrc0->getCStride();
- int iSrc1Stride = pcYuvSrc1->getCStride();
- int iDstStride = Cwidth;
+ int src0Stride = pcYuvSrc0->getCStride();
+ int src1Stride = pcYuvSrc1->getCStride();
+ int dstStride = Cwidth; // both destination planes use the same row pitch, Cwidth
- for (y = uiPartSize - 1; y >= 0; y--)
- {
- for (x = uiPartSize - 1; x >= 0; x--)
- {
- pDstU[x] = static_cast<short>(pSrcU0[x]) - static_cast<short>(pSrcU1[x]);
- pDstV[x] = static_cast<short>(pSrcV0[x]) - static_cast<short>(pSrcV1[x]);
- }
-
- pSrcU0 += iSrc0Stride;
- pSrcU1 += iSrc1Stride;
- pSrcV0 += iSrc0Stride;
- pSrcV1 += iSrc1Stride;
- pDstU += iDstStride;
- pDstV += iDstStride;
- }
+ primitives.ChromaSubstract_sp(x, y, dstU, dstStride, dstV, dstStride, srcU0, srcU1, src0Stride, src1Stride, srcV0, srcV1, src0Stride, src1Stride); // U and V share the strides of their source picture, so each stride is passed twice
}
void TShortYUV::addClip(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
diff -r c9bb72e8cb8e -r be5257d512be source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Jul 15 23:41:11 2013 -0500
+++ b/source/common/pixel.cpp Tue Jul 16 17:40:05 2013 +0530
@@ -30,7 +30,6 @@
#include "TLibCommon/CommonDef.h"
#include "TLibCommon/TComPrediction.h"
-
#define SET_FUNC_PRIMITIVE_TABLE_C_SUBSET(WIDTH, FUNC_PREFIX, FUNC_PREFIX_DEF, FUNC_TYPE_CAST, DATA_TYPE1, DATA_TYPE2) \
p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x4] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 4, DATA_TYPE1, DATA_TYPE2>; \
p.FUNC_PREFIX[PARTITION_ ## WIDTH ## x8] = (FUNC_TYPE_CAST)FUNC_PREFIX_DEF<WIDTH, 8, DATA_TYPE1, DATA_TYPE2>; \
@@ -388,6 +387,41 @@
}
}
+void Lumasubstract_s_p(int bx, int by, short *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1) // C reference: a = b0 - b1 over a bx-wide, by-tall block
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ a[x] = (short)(b0[x] - b1[x]); // per-sample difference, narrowed to short
+ }
+ // move every buffer to its next row using its own stride
+ b0 += sstride0;
+ b1 += sstride1;
+ a += dstride;
+ }
+}
+
+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,
+ pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1) // C reference: dstu = u0 - u1 and dstv = v0 - v1 over a bx-wide, by-tall block
+{
+ for (int y = 0; y < by; y++)
+ {
+ for (int x = 0; x < bx; x++)
+ {
+ dstu[x] = (short)(u0[x] - u1[x]); // both planes are handled in the same pass
+ dstv[x] = (short)(v0[x] - v1[x]);
+ }
+ // move all six buffers to their next row, each by its own stride
+ u0 += sstrideu0;
+ u1 += sstrideu1;
+ v0 += sstridev0;
+ v1 += sstridev1;
+ dstu += dstsrideu;
+ dstv += dstsridev;
+ }
+}
+
void blockcopy_p_s(int bx, int by, pixel *a, intptr_t stridea, short *b, intptr_t strideb)
{
for (int y = 0; y < by; y++)
@@ -504,14 +538,15 @@
void weightUnidir(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth)
{
int x, y;
+
for (y = height - 1; y >= 0; y--)
{
for (x = width - 1; x >= 0; )
{
// note: luma min width is 4
- dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
+ dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
x--;
- dst[x] = (pixel) Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
+ dst[x] = (pixel)Clip3(0, ((1 << bitDepth) - 1), ((w0 * (src[x] + IF_INTERNAL_OFFS) + round) >> shift) + offset);
x--;
}
@@ -519,7 +554,6 @@
dst += dstStride;
}
}
-
} // end anonymous namespace
namespace x265 {
@@ -619,6 +653,8 @@
p.blockcpy_ps = blockcopy_p_s;
p.blockcpy_sp = blockcopy_s_p;
p.blockcpy_sc = blockcopy_s_c;
+ p.LumaSubstract_sp = Lumasubstract_s_p;
+ p.ChromaSubstract_sp = Chromasubstract_s_p;
p.cvt16to32 = convert16to32;
p.cvt16to32_shl = convert16to32_shl;
diff -r c9bb72e8cb8e -r be5257d512be source/common/primitives.h
--- a/source/common/primitives.h Mon Jul 15 23:41:11 2013 -0500
+++ b/source/common/primitives.h Tue Jul 16 17:40:05 2013 +0530
@@ -192,6 +192,9 @@
typedef void (*ipfilter_s2p_t)(int bitDepth, short *src, int srcStride, pixel *dst, int dstStride, int width, int height);
typedef void (*blockcpy_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
typedef void (*blockcpy_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src, intptr_t sstride); // dst is aligned
+typedef void (*LumaSubstract_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); // dst = src0 - src1 over a bx x by block; dst is aligned
+typedef void (*ChromaSubstract_sp_t)(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,
+ pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1); // dstu = u0 - u1 and dstv = v0 - v1, both planes in one call
typedef void (*blockcpy_ps_t)(int bx, int by, pixel *dst, intptr_t dstride, short *src, intptr_t sstride); // dst is aligned
typedef void (*blockcpy_sc_t)(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride); // dst is aligned
typedef void (*intra_dc_t)(pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int bFilter);
@@ -236,6 +239,10 @@
blockcpy_ps_t blockcpy_ps; // block copy pixel from short
blockcpy_sp_t blockcpy_sp; // block copy short from pixel
blockcpy_sc_t blockcpy_sc; // block copy short from unsigned char
+
+ LumaSubstract_sp_t LumaSubstract_sp;
+ ChromaSubstract_sp_t ChromaSubstract_sp;
+
cvt16to32_t cvt16to32;
cvt16to32_shl_t cvt16to32_shl;
cvt16to16_shl_t cvt16to16_shl;
diff -r c9bb72e8cb8e -r be5257d512be source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc Mon Jul 15 23:41:11 2013 -0500
+++ b/source/common/vec/blockcopy.inc Tue Jul 16 17:40:05 2013 +0530
@@ -79,7 +79,7 @@
}
}
else
-#endif
+#endif /* if INSTRSET >= 8 */
if (!(aligncheck & 15))
{
// fast path, multiples of 16 pixel wide blocks
@@ -131,7 +131,7 @@
}
}
else
-#endif
+#endif /* if INSTRSET >= 8 && 0 */
if (!(aligncheck & 15))
{
// fast path, multiples of 16 pixel wide blocks
@@ -170,6 +170,7 @@
void blockcopy_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src, intptr_t sstride)
{
size_t aligncheck = (size_t)dst | (size_t)src | bx | sstride | dstride;
+
#if INSTRSET >= 8 && 0
if (!(aligncheck & 31))
{
@@ -189,7 +190,7 @@
}
}
else
-#endif
+#endif /* if INSTRSET >= 8 && 0 */
if (!(aligncheck & 15))
{
// fast path, multiples of 16 pixel wide blocks
@@ -223,6 +224,173 @@
}
}
+// Vectorized luma residual: dst[x] = src0[x] - src1[x] over a bx-wide, by-tall
+// block of 8-bit samples, widened to short. Each buffer advances by its own
+// stride after every row.
+void Lumasubstract_s_p(int bx, int by, short *dst, intptr_t dstride, uint8_t *src0, uint8_t *src1, intptr_t sstride0, intptr_t sstride1)
+{
+    // Fold every pointer, every stride and the width into one word: when its
+    // low bits are clear, all rows of all three buffers are aligned and bx is
+    // a whole number of vectors. src1 and sstride1 must take part as well,
+    // because src1 is read with the aligned load_a() just like src0.
+    size_t aligncheck = (size_t)dst | (size_t)src0 | (size_t)src1 | bx | sstride0 | sstride1 | dstride;
+
+#if INSTRSET >= 8 && 0
+    // 256-bit variant; the trailing "&& 0" keeps it compiled out
+    if (!(aligncheck & 31))
+    {
+        // fast path, multiples of 32 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 32)
+            {
+                Vec32uc word0, word1;
+                Vec16s word3, word4;
+                word0.load_a(src0 + x);
+                word1.load_a(src1 + x);
+                word3 = extend_low(word0) - extend_low(word1);
+                word4 = extend_high(word0) - extend_high(word1);
+                word3.store_a(dst + x);
+                word4.store_a(dst + x + 16);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+#endif /* if INSTRSET >= 8 && 0 */
+    if (!(aligncheck & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                Vec16uc word0, word1;
+                Vec8s word3, word4;
+                word0.load_a(src0 + x);
+                word1.load_a(src1 + x);
+                // widen each half to 16 bits before subtracting
+                word3 = extend_low(word0) - extend_low(word1);
+                word4 = extend_high(word0) - extend_high(word1);
+                word3.store_a(dst + x);
+                word4.store_a(dst + x + 8);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+    {
+        // slow path, irregular memory alignments or sizes
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x++)
+            {
+                dst[x] = (short)(src0[x] - src1[x]);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+}
+
+
+// Vectorized chroma residual: dstu = u0 - u1 and dstv = v0 - v1 over a bx-wide,
+// by-tall block, handling both chroma planes in a single pass. Every buffer
+// advances by its own stride after each row.
+// NOTE(review): the vector paths load 16 bytes as 16 samples, so they assume
+// pixel is an 8-bit type; the function is only registered in the
+// non-HIGH_BIT_DEPTH branch of Setup_Vec_BlockCopyPrimitives -- confirm.
+void Chromasubstract_s_p(int bx, int by, short *dstu, intptr_t dstsrideu, short *dstv, intptr_t dstsridev, pixel *u0, pixel *u1, intptr_t sstrideu0, intptr_t sstrideu1,
+                         pixel *v0, pixel *v1, intptr_t sstridev0, intptr_t sstridev1)
+{
+    // Fold all six pointers, all six strides and the width into one word: when
+    // its low bits are clear, every row of every buffer is aligned and bx is a
+    // whole number of vectors. Every buffer below is accessed with the aligned
+    // load_a()/store_a(), so none of them may be left out of the check.
+    size_t aligncheck = (size_t)dstu | (size_t)dstv | (size_t)u0 | (size_t)u1 | (size_t)v0 | (size_t)v1 | bx |
+                        sstrideu0 | sstrideu1 | sstridev0 | sstridev1 | dstsrideu | dstsridev;
+
+#if INSTRSET >= 8 && 0
+    // 256-bit variant; the trailing "&& 0" keeps it compiled out
+    if (!(aligncheck & 31))
+    {
+        // fast path, multiples of 32 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 32)
+            {
+                Vec32uc uword0, uword1;
+                Vec16s uword3, uword4;
+                uword0.load_a(u0 + x);
+                uword1.load_a(u1 + x);
+                uword3 = extend_low(uword0) - extend_low(uword1);
+                uword4 = extend_high(uword0) - extend_high(uword1);
+                uword3.store_a(dstu + x);
+                uword4.store_a(dstu + x + 16);
+
+                Vec32uc vword0, vword1;
+                Vec16s vword3, vword4;
+                vword0.load_a(v0 + x);
+                vword1.load_a(v1 + x);
+                vword3 = extend_low(vword0) - extend_low(vword1);
+                vword4 = extend_high(vword0) - extend_high(vword1);
+                vword3.store_a(dstv + x);
+                vword4.store_a(dstv + x + 16);
+            }
+
+            u0 += sstrideu0;
+            u1 += sstrideu1;
+            v0 += sstridev0;
+            v1 += sstridev1;
+            dstu += dstsrideu;
+            dstv += dstsridev;
+        }
+    }
+    else
+#endif /* if INSTRSET >= 8 && 0 */
+    if (!(aligncheck & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                // U plane: widen each half to 16 bits, then subtract
+                Vec16uc uword0, uword1;
+                Vec8s uword3, uword4;
+                uword0.load_a(u0 + x);
+                uword1.load_a(u1 + x);
+                uword3 = extend_low(uword0) - extend_low(uword1);
+                uword4 = extend_high(uword0) - extend_high(uword1);
+                uword3.store_a(dstu + x);
+                uword4.store_a(dstu + x + 8);
+
+                // V plane: same operation on the second pair of sources
+                Vec16uc vword0, vword1;
+                Vec8s vword3, vword4;
+                vword0.load_a(v0 + x);
+                vword1.load_a(v1 + x);
+                vword3 = extend_low(vword0) - extend_low(vword1);
+                vword4 = extend_high(vword0) - extend_high(vword1);
+                vword3.store_a(dstv + x);
+                vword4.store_a(dstv + x + 8);
+            }
+
+            u0 += sstrideu0;
+            u1 += sstrideu1;
+            v0 += sstridev0;
+            v1 += sstridev1;
+            dstu += dstsrideu;
+            dstv += dstsridev;
+        }
+    }
+    else
+    {
+        // slow path, irregular memory alignments or sizes
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x++)
+            {
+                dstu[x] = (short)(u0[x] - u1[x]);
+                dstv[x] = (short)(v0[x] - v1[x]);
+            }
+
+            u0 += sstrideu0;
+            u1 += sstrideu1;
+            v0 += sstridev0;
+            v1 += sstridev1;
+            dstu += dstsrideu;
+            dstv += dstsridev;
+        }
+    }
+}
+
+
void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
{
#if HIGH_BIT_DEPTH
@@ -231,10 +399,13 @@
p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;
p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;
p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;
+ p.blockcpyyuv_sp = (x265::blockcpy_sc_t)blockcopyYuv_s_p;
#else
p.blockcpy_pp = blockcopy_p_p;
p.blockcpy_ps = blockcopy_p_s;
p.blockcpy_sp = blockcopy_s_p;
p.blockcpy_sc = blockcopy_s_p;
-#endif
+ p.LumaSubstract_sp = Lumasubstract_s_p;
+ p.ChromaSubstract_sp = Chromasubstract_s_p;
+#endif /* if HIGH_BIT_DEPTH */
}
diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jul 15 23:41:11 2013 -0500
+++ b/source/test/pixelharness.cpp Tue Jul 16 17:40:05 2013 +0530
@@ -56,10 +56,13 @@
pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
+ pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
+ pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
+
sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
- if (!pbuf1 || !pbuf2)
+ if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2) // every one of these buffers is filled below, so all allocations must have succeeded
{
fprintf(stderr, "malloc failed, unable to initiate tests!\n");
exit(1);
@@ -71,6 +74,9 @@
pbuf1[i] = rand() & PIXEL_MAX;
pbuf2[i] = rand() & PIXEL_MAX;
+ pbuf3[i] = rand() & PIXEL_MAX;
+ pbuf4[i] = rand() & PIXEL_MAX;
+
sbuf1[i] = rand() & PIXEL_MAX;
sbuf2[i] = rand() & PIXEL_MAX;
}
@@ -222,6 +228,59 @@
return true;
}
+bool PixelHarness::check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt) // returns true when opt reproduces ref's output on every tested block
+{
+ ALIGN_VAR_16(short, ref_dest[64 * 64]);
+ ALIGN_VAR_16(short, opt_dest[64 * 64]);
+ int bx = 64; // first pass uses the full 64x64 block, so both destinations are completely written before the first compare
+ int by = 64;
+ int j = 0; // source offset; NOTE(review): both sources share it, so a source pair with differing alignment is never exercised
+ for (int i = 0; i <= 100; i++)
+ {
+ opt(bx, by, opt_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
+ ref(bx, by, ref_dest, 64, pbuf2 + j, pbuf1 + j, 128, 128);
+ // the whole buffers are compared, not only the bx x by region just written
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)))
+ return false;
+ // next pass: shift the sources by 4 samples and pick a new block size
+ j += 4;
+ bx = 4 * ((rand() & 15) + 1); // 4, 8, ..., 64
+ by = 4 * ((rand() & 15) + 1);
+ }
+ // every pass matched
+ return true;
+}
+
+bool PixelHarness::check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt) // returns true when opt reproduces ref's U and V output on every tested block
+{
+ ALIGN_VAR_16(short, ref_destu[64 * 64]);
+ ALIGN_VAR_16(short, opt_destu[64 * 64]);
+ // separate destination pair for the V plane
+ ALIGN_VAR_16(short, ref_destv[64 * 64]);
+ ALIGN_VAR_16(short, opt_destv[64 * 64]);
+ // first pass uses the full 64x64 block, so all four destinations are completely written before the first compare
+ int bx = 64;
+ int by = 64;
+ int j = 0; // source offset; NOTE(review): all four sources share it, so sources with differing alignment are never exercised
+ for (int i = 0; i <= 100; i++)
+ {
+ opt(bx, by, opt_destu, 64, opt_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);
+ ref(bx, by, ref_destu, 64, ref_destv, 64, pbuf2 + j, pbuf1 + j, 128, 128, pbuf3 + j, pbuf4 + j, 128, 128);
+ // the whole buffers are compared, not only the bx x by region just written
+ if (memcmp(ref_destu, opt_destu, 64 * 64 * sizeof(short)))
+ return false;
+ // same check for the V plane
+ if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(short)))
+ return false;
+ // next pass: shift the sources by 4 samples and pick a new block size
+ j += 4;
+ bx = 4 * ((rand() & 15) + 1); // 4, 8, ..., 64
+ by = 4 * ((rand() & 15) + 1);
+ }
+ // every pass matched
+ return true;
+}
+
bool PixelHarness::check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt)
{
ALIGN_VAR_16(short, ref_dest[64 * 64]);
@@ -341,14 +400,15 @@
int offset = (rand() % 256) - 128;
for (int i = 0; i <= 100; i++)
{
- opt(sbuf1+j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);
- ref(sbuf1+j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);
+ opt(sbuf1 + j, opt_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);
+ ref(sbuf1 + j, ref_dest, 64, 64, width, height, w0, round, shift, offset, BIT_DEPTH);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
return false;
j += 4;
}
+
return true;
}
@@ -484,6 +544,24 @@
}
}
+ if (opt.LumaSubstract_sp)
+ {
+ if (!check_LumaSubstract_s_p(ref.LumaSubstract_sp, opt.LumaSubstract_sp))
+ {
+ printf("Luma Substract failed!\n");
+ return false;
+ }
+ }
+
+ if (opt.ChromaSubstract_sp)
+ {
+ if (!check_ChromaSubstract_s_p(ref.ChromaSubstract_sp, opt.ChromaSubstract_sp))
+ {
+ printf("Chroma Substract failed!\n");
+ return false;
+ }
+ }
+
if (opt.blockcpy_sc)
{
if (!check_block_copy_s_c(ref.blockcpy_sc, opt.blockcpy_sc))
@@ -492,7 +570,7 @@
return false;
}
}
-
+
if (opt.weightpUni)
{
if (!check_weightpUni(ref.weightpUni, opt.weightpUni))
@@ -502,7 +580,6 @@
}
}
-
return true;
}
@@ -600,6 +677,18 @@
REPORT_SPEEDUP(opt.blockcpy_sp, ref.blockcpy_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, STRIDE);
}
+ if (opt.LumaSubstract_sp)
+ {
+ printf("Luma Sub");
+ REPORT_SPEEDUP(opt.LumaSubstract_sp, ref.LumaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
+
+ if (opt.ChromaSubstract_sp)
+ {
+ printf("Chroma Sub");
+ REPORT_SPEEDUP(opt.ChromaSubstract_sp, ref.ChromaSubstract_sp, 64, 64, (short*)pbuf1, FENC_STRIDE, (short*)pbuf2, FENC_STRIDE, pbuf3, pbuf3, STRIDE, STRIDE, pbuf4, pbuf4, STRIDE, STRIDE);
+ }
+
if (opt.blockcpy_sc)
{
printf("s_c cpy");
@@ -609,6 +698,6 @@
if (opt.weightpUni)
{
printf("WeightpUni");
- REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1<<9, 10, 100, BIT_DEPTH);
+ REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100, BIT_DEPTH);
}
}
diff -r c9bb72e8cb8e -r be5257d512be source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jul 15 23:41:11 2013 -0500
+++ b/source/test/pixelharness.h Tue Jul 16 17:40:05 2013 +0530
@@ -31,7 +31,7 @@
{
protected:
- pixel *pbuf1, *pbuf2;
+ pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
short *sbuf1, *sbuf2;
@@ -42,6 +42,8 @@
bool check_pixelcmp_x4(x265::pixelcmp_x4_t ref, x265::pixelcmp_x4_t opt);
bool check_block_copy(x265::blockcpy_pp_t ref, x265::blockcpy_pp_t opt);
bool check_block_copy_s_p(x265::blockcpy_sp_t ref, x265::blockcpy_sp_t opt);
+ bool check_LumaSubstract_s_p(x265::LumaSubstract_sp_t ref, x265::LumaSubstract_sp_t opt);
+ bool check_ChromaSubstract_s_p(x265::ChromaSubstract_sp_t ref, x265::ChromaSubstract_sp_t opt);
bool check_block_copy_p_s(x265::blockcpy_ps_t ref, x265::blockcpy_ps_t opt);
bool check_block_copy_s_c(x265::blockcpy_sc_t ref, x265::blockcpy_sc_t opt);
bool check_calresidual(x265::calcresidual_t ref, x265::calcresidual_t opt);
More information about the x265-devel
mailing list