[x265] [PATCH] TShortYUV : Implemented performance Primitives pixelsub_sp
gopu at multicorewareinc.com
gopu at multicorewareinc.com
Wed Jul 17 08:03:30 CEST 2013
# HG changeset patch
# User ggopu
# Date 1374041000 -19800
# Node ID 054d8c409569100c4aacb015ffb1b3281100d993
# Parent 0becdecde6ee77e4ec43daf8996a8eeb3f6f6131
TShortYUV : Implemented performance Primitives pixelsub_sp
diff -r 0becdecde6ee -r 054d8c409569 source/common/TShortYUV.cpp
--- a/source/common/TShortYUV.cpp Wed Jul 17 00:13:29 2013 -0500
+++ b/source/common/TShortYUV.cpp Wed Jul 17 11:33:20 2013 +0530
@@ -30,6 +30,8 @@
#include <assert.h>
#include <math.h>
+using namespace x265;
+
TShortYUV::TShortYUV()
{
YBuf = NULL;
@@ -76,61 +78,38 @@
subtractChroma(pcYuvSrc0, pcYuvSrc1, uiTrUnitIdx, uiPartSize >> 1);
}
-void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::subtractLuma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)
{
- int x, y;
+ int x = partSize, y = partSize; // width and height passed to the primitive; the block is square
- Pel* pSrc0 = pcYuvSrc0->getLumaAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrc1 = pcYuvSrc1->getLumaAddr(uiTrUnitIdx, uiPartSize);
- Short* pDst = getLumaAddr(uiTrUnitIdx, uiPartSize);
+ Pel* src0 = pcYuvSrc0->getLumaAddr(trUnitIdx, partSize);
+ Pel* src1 = pcYuvSrc1->getLumaAddr(trUnitIdx, partSize);
+ Short* dst = getLumaAddr(trUnitIdx, partSize); // destination comes from this object's own getLumaAddr()
- int iSrc0Stride = pcYuvSrc0->getStride();
- int iSrc1Stride = pcYuvSrc1->getStride();
- int iDstStride = width;
+ int src0Stride = pcYuvSrc0->getStride();
+ int src1Stride = pcYuvSrc1->getStride();
+ int dstStride = width; // destination rows are `width` elements apart
- for (y = uiPartSize - 1; y >= 0; y--)
- {
- for (x = uiPartSize - 1; x >= 0; x--)
- {
- pDst[x] = static_cast<short>(pSrc0[x]) - static_cast<short>(pSrc1[x]);
- }
-
- pSrc0 += iSrc0Stride;
- pSrc1 += iSrc1Stride;
- pDst += iDstStride;
- }
+ primitives.pixelsubsp(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride); // dst = src0 - src1 per sample; stands in for the removed scalar loop
}
-void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
+void TShortYUV::subtractChroma(TComYuv* pcYuvSrc0, TComYuv* pcYuvSrc1, unsigned int trUnitIdx, unsigned int partSize)
{
- int x, y;
+ int x = partSize, y = partSize; // width and height passed to the primitive; the block is square
- Pel* pSrcU0 = pcYuvSrc0->getCbAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcU1 = pcYuvSrc1->getCbAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcV0 = pcYuvSrc0->getCrAddr(uiTrUnitIdx, uiPartSize);
- Pel* pSrcV1 = pcYuvSrc1->getCrAddr(uiTrUnitIdx, uiPartSize);
- Short* pDstU = getCbAddr(uiTrUnitIdx, uiPartSize);
- Short* pDstV = getCrAddr(uiTrUnitIdx, uiPartSize);
+ Pel* srcU0 = pcYuvSrc0->getCbAddr(trUnitIdx, partSize);
+ Pel* srcU1 = pcYuvSrc1->getCbAddr(trUnitIdx, partSize);
+ Pel* srcV0 = pcYuvSrc0->getCrAddr(trUnitIdx, partSize);
+ Pel* srcV1 = pcYuvSrc1->getCrAddr(trUnitIdx, partSize);
+ Short* dstU = getCbAddr(trUnitIdx, partSize);
+ Short* dstV = getCrAddr(trUnitIdx, partSize);
- int iSrc0Stride = pcYuvSrc0->getCStride();
- int iSrc1Stride = pcYuvSrc1->getCStride();
- int iDstStride = Cwidth;
+ int src0Stride = pcYuvSrc0->getCStride(); // one stride per source object, shared by its Cb and Cr planes
+ int src1Stride = pcYuvSrc1->getCStride();
+ int dstStride = Cwidth; // destination chroma rows are `Cwidth` elements apart
- for (y = uiPartSize - 1; y >= 0; y--)
- {
- for (x = uiPartSize - 1; x >= 0; x--)
- {
- pDstU[x] = static_cast<short>(pSrcU0[x]) - static_cast<short>(pSrcU1[x]);
- pDstV[x] = static_cast<short>(pSrcV0[x]) - static_cast<short>(pSrcV1[x]);
- }
-
- pSrcU0 += iSrc0Stride;
- pSrcU1 += iSrc1Stride;
- pSrcV0 += iSrc0Stride;
- pSrcV1 += iSrc1Stride;
- pDstU += iDstStride;
- pDstV += iDstStride;
- }
+ primitives.pixelsubsp(x, y, dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride); // Cb: dstU = srcU0 - srcU1
+ primitives.pixelsubsp(x, y, dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride); // Cr: dstV = srcV0 - srcV1, done as a second pass
}
void TShortYUV::addClip(TShortYUV* pcYuvSrc0, TShortYUV* pcYuvSrc1, unsigned int uiTrUnitIdx, unsigned int uiPartSize)
diff -r 0becdecde6ee -r 054d8c409569 source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Jul 17 00:13:29 2013 -0500
+++ b/source/common/pixel.cpp Wed Jul 17 11:33:20 2013 +0530
@@ -520,6 +520,21 @@
}
}
+// C reference: a[x] = b0[x] - b1[x] (converted to short) over a block bx
+// wide and by high. Each stride is the element step between successive rows.
+void pixelsub_sp(int bx, int by, short *a, intptr_t dstride, pixel *b0, pixel *b1, intptr_t sstride0, intptr_t sstride1)
+{
+    for (int row = 0; row < by; row++)
+    {
+        for (int col = 0; col < bx; col++)
+            a[col] = (short)(b0[col] - b1[col]);
+        // step all three pointers to the next row
+        a += dstride;
+        b0 += sstride0;
+        b1 += sstride1;
+    }
+}
+
} // end anonymous namespace
namespace x265 {
@@ -721,5 +736,7 @@
p.transpose[4] = transpose<64>;
p.weightpUni = weightUnidir;
+
+ p.pixelsubsp = pixelsub_sp;
}
}
diff -r 0becdecde6ee -r 054d8c409569 source/common/primitives.h
--- a/source/common/primitives.h Wed Jul 17 00:13:29 2013 -0500
+++ b/source/common/primitives.h Wed Jul 17 11:33:20 2013 +0530
@@ -215,6 +215,7 @@
typedef void (*dequant_t)(int bitDepth, const int* src, int* dst, int width, int height, int mcqp_miper, int mcqp_mirem, bool useScalingList, unsigned int trSizeLog2, int *dequantCoef);
typedef uint32_t (*quant_t)(int *coef, int *quantCoeff, int *deltaU, int *qCoef, int qBits, int add, int numCoeff);
typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset, int bitDepth);
+typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1); // dst[x] = src0[x] - src1[x] over a bx-wide, by-high block
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -265,6 +266,7 @@
transpose_t transpose[NUM_SQUARE_BLOCKS];
weightpUni_t weightpUni;
+ pixelsub_sp_t pixelsubsp;
};
/* This copy of the table is what gets used by the encoder.
diff -r 0becdecde6ee -r 054d8c409569 source/common/vec/blockcopy.inc
--- a/source/common/vec/blockcopy.inc Wed Jul 17 00:13:29 2013 -0500
+++ b/source/common/vec/blockcopy.inc Wed Jul 17 11:33:20 2013 +0530
@@ -223,6 +223,75 @@
}
}
+// Vectorised dst = src0 - src1 for a bx-wide, by-high block of 8-bit samples,
+// widened to 16-bit. Strides are in elements. Aligned loads/stores are used only
+// when every pointer, every stride and the width pass the alignment check below.
+void pixelsub_sp(int bx, int by, short *dst, intptr_t dstride, uint8_t *src0, uint8_t *src1, intptr_t sstride0, intptr_t sstride1)
+{
+    // src1 and sstride1 must be part of the check too: src1 is read with load_a()
+    size_t aligncheck = (size_t)dst | (size_t)src0 | (size_t)src1 | bx | sstride0 | sstride1 | dstride;
+
+#if INSTRSET >= 8 && 0
+    if (!(aligncheck & 31))
+    {
+        // fast path, multiples of 32 pixel wide blocks (compiled out by the "&& 0" above)
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 32)
+            {
+                Vec32uc word0, word1;
+                Vec16s word3, word4;
+                word0.load_a(src0 + x);
+                word1.load_a(src1 + x);
+                word3 = extend_low(word0) - extend_low(word1);
+                word4 = extend_high(word0) - extend_high(word1);
+                word3.store_a(dst + x);
+                word4.store_a(dst + x + 16);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+#endif /* if INSTRSET >= 8 && 0 */
+    if (!(aligncheck & 15))
+    {
+        // fast path, multiples of 16 pixel wide blocks
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x += 16)
+            {
+                Vec16uc word0, word1;
+                Vec8s word3, word4;
+                word0.load_a(src0 + x);
+                word1.load_a(src1 + x);
+                word3 = extend_low(word0) - extend_low(word1);
+                word4 = extend_high(word0) - extend_high(word1);
+                word3.store_a(dst + x);
+                word4.store_a(dst + x + 8);
+            }
+
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+    else
+    {
+        // slow path, irregular memory alignments or sizes
+        for (int y = 0; y < by; y++)
+        {
+            for (int x = 0; x < bx; x++)
+                dst[x] = (short)(src0[x] - src1[x]);
+            src0 += sstride0;
+            src1 += sstride1;
+            dst += dstride;
+        }
+    }
+}
+
void Setup_Vec_BlockCopyPrimitives(EncoderPrimitives &p)
{
#if HIGH_BIT_DEPTH
@@ -231,10 +300,12 @@
p.blockcpy_ps = (x265::blockcpy_ps_t)blockcopy_p_p;
p.blockcpy_sp = (x265::blockcpy_sp_t)blockcopy_p_p;
p.blockcpy_sc = (x265::blockcpy_sc_t)blockcopy_s_p;
+    // pixelsubsp is deliberately not set here: pixelsub_sp takes uint8_t* sources, which cannot match pixel* if pixel is 16-bit in this branch (as the casts above suggest - TODO confirm)
#else
p.blockcpy_pp = blockcopy_p_p;
p.blockcpy_ps = blockcopy_p_s;
p.blockcpy_sp = blockcopy_s_p;
p.blockcpy_sc = blockcopy_s_p;
+ p.pixelsubsp = pixelsub_sp;
#endif
}
diff -r 0becdecde6ee -r 054d8c409569 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Jul 17 00:13:29 2013 -0500
+++ b/source/test/pixelharness.cpp Wed Jul 17 11:33:20 2013 +0530
@@ -353,6 +353,29 @@
return true;
}
+// Runs ref and opt on 101 blocks and compares the complete 64x64 destination
+// buffers after each one. The first block is 64x64; later blocks use random
+// sizes (multiples of 4, up to 64) and source offsets that grow by 4 elements.
+bool PixelHarness::check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt)
+{
+    ALIGN_VAR_16(short, ref_dest[64 * 64]);
+    ALIGN_VAR_16(short, opt_dest[64 * 64]);
+    int width = 64;
+    int height = 64;
+    for (int iter = 0; iter <= 100; iter++)
+    {
+        const int offset = 4 * iter;
+        opt(width, height, opt_dest, 64, pbuf2 + offset, pbuf1 + offset, 128, 128);
+        ref(width, height, ref_dest, 64, pbuf2 + offset, pbuf1 + offset, 128, 128);
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(short)) != 0)
+            return false;
+
+        width = 4 * ((rand() & 15) + 1); // width is drawn before height, keeping the rand() sequence
+        height = 4 * ((rand() & 15) + 1);
+    }
+    return true;
+}
+
bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -503,6 +526,15 @@
}
}
+ if (opt.pixelsubsp)
+ {
+ if (!check_pixelsub_sp(ref.pixelsubsp, opt.pixelsubsp))
+ {
+ printf("Luma Substract failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -611,4 +643,10 @@
printf("WeightpUni");
REPORT_SPEEDUP(opt.weightpUni, ref.weightpUni, sbuf1, pbuf1, 64, 64, 32, 32, 128, 1 << 9, 10, 100, BIT_DEPTH);
}
+
+ if (opt.pixelsubsp)
+ {
+ printf("Pixel Sub");
+ REPORT_SPEEDUP(opt.pixelsubsp, ref.pixelsubsp, 64, 64, (short*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
}
diff -r 0becdecde6ee -r 054d8c409569 source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Jul 17 00:13:29 2013 -0500
+++ b/source/test/pixelharness.h Wed Jul 17 11:33:20 2013 +0530
@@ -47,6 +47,7 @@
bool check_calresidual(x265::calcresidual_t ref, x265::calcresidual_t opt);
bool check_calcrecon(x265::calcrecon_t ref, x265::calcrecon_t opt);
bool check_weightpUni(x265::weightpUni_t ref, x265::weightpUni_t opt);
+ bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);
public:
More information about the x265-devel
mailing list