[x265] [PATCH] TComYuv::copyFromPicLuma, asm integration for chroma blockcopy_pp
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Fri Nov 15 13:19:52 CET 2013
# HG changeset patch
# User Praveen Tiwari
# Date 1384517979 -19800
# Node ID bd9aa48dcfb7c44f0d1bb873d6965f4d3dbf7433
# Parent 2307c52f11b2e6c7b21d94c207d98f9fac8e1ab9
TComYuv::copyFromPicLuma, asm integration for chroma blockcopy_pp
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Fri Nov 15 12:17:31 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp Fri Nov 15 17:49:39 2013 +0530
@@ -78,6 +78,9 @@
m_cwidth = width >> m_hChromaShift;
m_cheight = height >> m_vChromaShift;
+
+ m_csp = csp;
+ m_part = partitionFromSizes(m_width, m_height);
}
void TComYuv::destroy()
@@ -155,8 +158,7 @@
uint32_t dststride = getStride();
uint32_t srcstride = srcPicYuv->getStride();
- int part = partitionFromSizes(m_width, m_height);
- primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
+ primitives.luma_copy_pp[m_part](dst, dststride, src, srcstride);
}
void TComYuv::copyFromPicChroma(TComPicYuv* srcPicYuv, uint32_t cuAddr, uint32_t absZOrderIdx)
@@ -169,8 +171,8 @@
uint32_t dststride = getCStride();
uint32_t srcstride = srcPicYuv->getCStride();
- primitives.blockcpy_pp(m_cwidth, m_cheight, dstU, dststride, srcU, srcstride);
- primitives.blockcpy_pp(m_cwidth, m_cheight, dstV, dststride, srcV, srcstride);
+ primitives.chroma_copy_pp[m_csp][m_part](dstU, dststride, srcU, srcstride);
+ primitives.chroma_copy_pp[m_csp][m_part](dstV, dststride, srcV, srcstride);
}
void TComYuv::copyToPartYuv(TComYuv* dstPicYuv, uint32_t uiDstPartIdx)
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/Lib/TLibCommon/TComYuv.h
--- a/source/Lib/TLibCommon/TComYuv.h Fri Nov 15 12:17:31 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.h Fri Nov 15 17:49:39 2013 +0530
@@ -80,6 +80,9 @@
int m_hChromaShift;
int m_vChromaShift;
+ int m_csp;
+ int m_part; // Cached result of partitionFromSizes(m_width, m_height), so callers need not recompute it
+
static int getAddrOffset(uint32_t partUnitIdx, uint32_t width)
{
int blkX = g_rasterToPelX[g_zscanToRaster[partUnitIdx]];
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/pixel.cpp Fri Nov 15 17:49:39 2013 +0530
@@ -835,7 +835,7 @@
p.satd[LUMA_16x64] = satd8<16, 64>;
#define CHROMA(W, H) \
- p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
+ p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_c<W, H>; \
p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_c<W, H>; \
p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_c<W, H>;\
p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_c<W, H>;
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/primitives.h
--- a/source/common/primitives.h Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/primitives.h Fri Nov 15 17:49:39 2013 +0530
@@ -136,6 +136,14 @@
NUM_IPFILTER_S_S
};
+// This enum maps its members to actual csp values
+enum ColorspaceType
+{
+ CSP_NONE,
+ CSP_I420,
+ NUM_CSP
+};
+
// Returns a LumaPartitions enum for the given size, always expected to return a valid enum
inline int partitionFromSizes(int width, int height)
{
@@ -232,7 +240,7 @@
cvt32to16_shr_t cvt32to16_shr;
copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
- copy_pp_t chroma_copy_pp[NUM_CHROMA_PARTITIONS];
+ copy_pp_t chroma_copy_pp[NUM_CSP][NUM_CHROMA_PARTITIONS];
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
copy_sp_t chroma_copy_sp[NUM_CHROMA_PARTITIONS];
copy_ps_t luma_copy_ps[NUM_LUMA_PARTITIONS];
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Nov 15 12:17:31 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 15 17:49:39 2013 +0530
@@ -145,9 +145,6 @@
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
p.chroma_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
-#define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
- p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
-
#define CHROMA_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
@@ -197,32 +194,6 @@
SETUP_CHROMA_SP_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_SP_FUNC_DEF(8, 32, cpu);
-#define CHROMA_BLOCKCOPY(cpu) \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(6, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu);
-
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
@@ -237,6 +208,36 @@
#define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
+#define SETUP_CHROMA_FROM_LUMA(W1, H1, W2, H2, cpu) \
+ p.chroma_copy_pp[X265_CSP_I420][LUMA_ ## W1 ## x ## H1] = x265_blockcopy_pp_ ## W2 ## x ## H2 ## cpu;
+
+// For X265_CSP_I420 chroma width and height will be half of luma width and height
+#define CHROMA_BLOCKCOPY(cpu) \
+ SETUP_CHROMA_FROM_LUMA(8, 8, 4, 4, cpu); \
+ SETUP_CHROMA_FROM_LUMA(8, 4, 4, 2, cpu); \
+ SETUP_CHROMA_FROM_LUMA(4, 8, 2, 4, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 16, 8, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 8, 8, 4, cpu); \
+ SETUP_CHROMA_FROM_LUMA(8, 16, 4, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 12, 8, 6, cpu); \
+ SETUP_CHROMA_FROM_LUMA(12, 16, 6, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 4, 8, 2, cpu); \
+ SETUP_CHROMA_FROM_LUMA(4, 16, 2, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(32, 32, 16, 16, cpu); \
+ SETUP_CHROMA_FROM_LUMA(32, 16, 16, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 32, 8, 16, cpu); \
+ SETUP_CHROMA_FROM_LUMA(32, 24, 16, 12, cpu); \
+ SETUP_CHROMA_FROM_LUMA(24, 32, 12, 16, cpu); \
+ SETUP_CHROMA_FROM_LUMA(32, 8, 16, 4, cpu); \
+ SETUP_CHROMA_FROM_LUMA(8, 32, 4, 16, cpu); \
+ SETUP_CHROMA_FROM_LUMA(64, 64, 32, 32, cpu); \
+ SETUP_CHROMA_FROM_LUMA(64, 32, 32, 16, cpu); \
+ SETUP_CHROMA_FROM_LUMA(32, 64, 16, 32, cpu); \
+ SETUP_CHROMA_FROM_LUMA(64, 48, 32, 24, cpu); \
+ SETUP_CHROMA_FROM_LUMA(48, 64, 24, 32, cpu); \
+ SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8, cpu); \
+ SETUP_CHROMA_FROM_LUMA(16, 64, 8, 32, cpu);
+
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_FUNC_DEF(8, 8, cpu); \
diff -r 2307c52f11b2 -r bd9aa48dcfb7 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Nov 15 12:17:31 2013 +0530
+++ b/source/test/pixelharness.cpp Fri Nov 15 17:49:39 2013 +0530
@@ -724,11 +724,11 @@
}
}
- if (opt.chroma_copy_pp[part])
+ if (opt.chroma_copy_pp[CSP_I420][part])
{
- if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
+ if (!check_block_copy_pp(ref.chroma_copy_pp[CSP_I420][part], opt.chroma_copy_pp[CSP_I420][part]))
{
- printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
+ printf("chroma_copy_pp[%s][%s] failed\n", "CSP_I420", chromaPartStr[part]);
return false;
}
}
@@ -1021,10 +1021,10 @@
REPORT_SPEEDUP(opt.luma_copy_pp[part], ref.luma_copy_pp[part], pbuf1, 64, pbuf2, 128);
}
- if (opt.chroma_copy_pp[part])
+ if (opt.chroma_copy_pp[CSP_I420][part])
{
- printf("ccpy_pp[%s]", chromaPartStr[part]);
- REPORT_SPEEDUP(opt.chroma_copy_pp[part], ref.chroma_copy_pp[part], pbuf1, 64, pbuf2, 128);
+ printf("ccpy_pp[%s][%s]", "CSP_I420", chromaPartStr[part]);
+ REPORT_SPEEDUP(opt.chroma_copy_pp[CSP_I420][part], ref.chroma_copy_pp[CSP_I420][part], pbuf1, 64, pbuf2, 128);
}
if (opt.luma_copy_sp[part])
More information about the x265-devel
mailing list