[x265] [PATCH] TShortYUV: asm code integration for pixelsub_ps
Murugan Vairavel <murugan at multicorewareinc.com>
Mon Nov 18 13:22:15 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1384777276 -19800
# Mon Nov 18 17:51:16 2013 +0530
# Node ID be8373f115dd7f152588ba8c575ad10dc6f5afb1
# Parent c355ba4b6711bfad87ff37d650a8f1946f878eec
TShortYUV: asm code integration for pixelsub_ps
diff -r c355ba4b6711 -r be8373f115dd source/common/TShortYUV.cpp
--- a/source/common/TShortYUV.cpp Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/TShortYUV.cpp Mon Nov 18 17:51:16 2013 +0530
@@ -58,6 +58,7 @@
m_cwidth = width >> m_hChromaShift;
m_cheight = height >> m_vChromaShift;
+ m_csp = csp;
}
void TShortYUV::destroy()
@@ -78,15 +79,14 @@
}
void TShortYUV::subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
-{
- subtractLuma(srcYuv0, srcYuv1, trUnitIdx, partSize);
- subtractChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift);
+{
+ int part = partitionFromSizes(partSize, partSize);
+ subtractLuma(srcYuv0, srcYuv1, trUnitIdx, partSize, part);
+ subtractChroma(srcYuv0, srcYuv1, trUnitIdx, partSize >> m_hChromaShift, part);
}
-void TShortYUV::subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
+void TShortYUV::subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part)
{
- int x = partSize, y = partSize;
-
Pel* src0 = srcYuv0->getLumaAddr(trUnitIdx, partSize);
Pel* src1 = srcYuv1->getLumaAddr(trUnitIdx, partSize);
int16_t* dst = getLumaAddr(trUnitIdx, partSize);
@@ -95,13 +95,11 @@
int src1Stride = srcYuv1->getStride();
int dstStride = m_width;
- primitives.pixelsub_ps(x, y, dst, dstStride, src0, src1, src0Stride, src1Stride);
+ primitives.luma_sub_ps[part](dst, dstStride, src0, src1, src0Stride, src1Stride);
}
-void TShortYUV::subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
+void TShortYUV::subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part)
{
- int x = partSize, y = partSize;
-
Pel* srcU0 = srcYuv0->getCbAddr(trUnitIdx, partSize);
Pel* srcU1 = srcYuv1->getCbAddr(trUnitIdx, partSize);
Pel* srcV0 = srcYuv0->getCrAddr(trUnitIdx, partSize);
@@ -113,8 +111,8 @@
int src1Stride = srcYuv1->getCStride();
int dstStride = m_cwidth;
- primitives.pixelsub_ps(x, y, dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);
- primitives.pixelsub_ps(x, y, dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);
+ primitives.chroma_sub_ps[m_csp][part](dstU, dstStride, srcU0, srcU1, src0Stride, src1Stride);
+ primitives.chroma_sub_ps[m_csp][part](dstV, dstStride, srcV0, srcV1, src0Stride, src1Stride);
}
void TShortYUV::addClip(TShortYUV* srcYuv0, TShortYUV* srcYuv1, unsigned int trUnitIdx, unsigned int partSize)
diff -r c355ba4b6711 -r be8373f115dd source/common/TShortYUV.h
--- a/source/common/TShortYUV.h Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/TShortYUV.h Mon Nov 18 17:51:16 2013 +0530
@@ -53,6 +53,8 @@
return blkX + blkY * size;
}
+ int m_csp;
+
public:
int16_t* m_bufY;
@@ -95,8 +97,8 @@
int16_t* getCrAddr(unsigned int partIdx, unsigned int size) { return m_bufCr + getAddrOffset(partIdx, size, m_cwidth); }
- void subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
- void subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
+ void subtractLuma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part);
+ void subtractChroma(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize, uint32_t part);
void subtract(TComYuv* srcYuv0, TComYuv* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
void addClip(TShortYUV* srcYuv0, TShortYUV* srcYuv1, unsigned int trUnitIdx, unsigned int partSize);
diff -r c355ba4b6711 -r be8373f115dd source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/pixel.cpp Mon Nov 18 17:51:16 2013 +0530
@@ -838,7 +838,7 @@
p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
- p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
+ p.chroma_sub_ps[CSP_I420][CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
#define LUMA(W, H) \
p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
diff -r c355ba4b6711 -r be8373f115dd source/common/primitives.h
--- a/source/common/primitives.h Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/primitives.h Mon Nov 18 17:51:16 2013 +0530
@@ -250,7 +250,7 @@
copy_ps_t chroma_copy_ps[NUM_CHROMA_PARTITIONS];
pixel_sub_ps_t luma_sub_ps[NUM_LUMA_PARTITIONS];
- pixel_sub_ps_t chroma_sub_ps[NUM_CHROMA_PARTITIONS];
+ pixel_sub_ps_t chroma_sub_ps[NUM_CSP][NUM_CHROMA_PARTITIONS];
ipfilter_ps_t ipfilter_ps[NUM_IPFILTER_P_S];
ipfilter_sp_t ipfilter_sp[NUM_IPFILTER_S_P];
diff -r c355ba4b6711 -r be8373f115dd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 18 16:49:30 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 18 17:51:16 2013 +0530
@@ -141,8 +141,7 @@
p.chroma_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu; \
p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu; \
p.chroma_vps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
- p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
+ p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
@@ -272,6 +271,35 @@
SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8, cpu); \
SETUP_CHROMA_FROM_LUMA(16, 64, 8, 32, cpu);
+#define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
+ p.chroma_sub_ps[X265_CSP_I420][LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu;
+
+#define CHROMA_PIXELSUB_PS(cpu) \
+ SETUP_CHROMA_LUMA(8, 8, 4, 4, cpu); \
+ SETUP_CHROMA_LUMA(8, 4, 4, 2, cpu); \
+ SETUP_CHROMA_LUMA(4, 8, 2, 4, cpu); \
+ SETUP_CHROMA_LUMA(16, 16, 8, 8, cpu); \
+ SETUP_CHROMA_LUMA(16, 8, 8, 4, cpu); \
+ SETUP_CHROMA_LUMA(8, 16, 4, 8, cpu); \
+ SETUP_CHROMA_LUMA(16, 12, 8, 6, cpu); \
+ SETUP_CHROMA_LUMA(12, 16, 6, 8, cpu); \
+ SETUP_CHROMA_LUMA(16, 4, 8, 2, cpu); \
+ SETUP_CHROMA_LUMA(4, 16, 2, 8, cpu); \
+ SETUP_CHROMA_LUMA(32, 32, 16, 16, cpu); \
+ SETUP_CHROMA_LUMA(32, 16, 16, 8, cpu); \
+ SETUP_CHROMA_LUMA(16, 32, 8, 16, cpu); \
+ SETUP_CHROMA_LUMA(32, 24, 16, 12, cpu); \
+ SETUP_CHROMA_LUMA(24, 32, 12, 16, cpu); \
+ SETUP_CHROMA_LUMA(32, 8, 16, 4, cpu); \
+ SETUP_CHROMA_LUMA(8, 32, 4, 16, cpu); \
+ SETUP_CHROMA_LUMA(64, 64, 32, 32, cpu); \
+ SETUP_CHROMA_LUMA(64, 32, 32, 16, cpu); \
+ SETUP_CHROMA_LUMA(32, 64, 16, 32, cpu); \
+ SETUP_CHROMA_LUMA(64, 48, 32, 24, cpu); \
+ SETUP_CHROMA_LUMA(48, 64, 24, 32, cpu); \
+ SETUP_CHROMA_LUMA(64, 16, 32, 8, cpu); \
+ SETUP_CHROMA_LUMA(16, 64, 8, 32, cpu);
+
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_FUNC_DEF(8, 8, cpu); \
@@ -589,6 +617,8 @@
p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
SA8D_INTER_FROM_BLOCK(sse4);
+ CHROMA_PIXELSUB_PS(_sse4);
+
CHROMA_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
HEVC_SATD(sse4);
diff -r c355ba4b6711 -r be8373f115dd source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Nov 18 16:49:30 2013 +0530
+++ b/source/test/pixelharness.cpp Mon Nov 18 17:51:16 2013 +0530
@@ -781,12 +781,15 @@
}
}
- if (opt.chroma_sub_ps[part])
+ for(int i = 0; i < NUM_CSP; i++)
{
- if (!check_pixel_sub_ps(ref.chroma_sub_ps[part], opt.chroma_sub_ps[part]))
+ if (opt.chroma_sub_ps[i][part])
{
- printf("chroma_sub_ps[%s] failed\n", chromaPartStr[part]);
- return false;
+ if (!check_pixel_sub_ps(ref.chroma_sub_ps[i][part], opt.chroma_sub_ps[i][part]))
+ {
+ printf("chroma_sub_ps[%s][%s] failed\n", colorSpaceNames[i], chromaPartStr[part]);
+ return false;
+ }
}
}
return true;
@@ -1063,10 +1066,13 @@
REPORT_SPEEDUP(opt.luma_sub_ps[part], ref.luma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
- if (opt.chroma_sub_ps[part])
+ for (int i = 0; i < NUM_CSP; i++)
{
- printf("chroma_sub_ps[%s]", chromaPartStr[part]);
- REPORT_SPEEDUP(opt.chroma_sub_ps[part], ref.chroma_sub_ps[part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ if (opt.chroma_sub_ps[i][part])
+ {
+ printf("chroma_sub_ps[%s][%s]", colorSpaceNames[i], chromaPartStr[part]);
+ REPORT_SPEEDUP(opt.chroma_sub_ps[i][part], ref.chroma_sub_ps[i][part], (int16_t*)pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
+ }
}
}
More information about the x265-devel mailing list