[x265] [PATCH] Enable PropagateCost assembly primitive
gopi.satykrishna at multicorewareinc.com
gopi.satykrishna at multicorewareinc.com
Wed Aug 31 15:40:07 CEST 2016
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1472623041 -19800
# Wed Aug 31 11:27:21 2016 +0530
# Node ID f4fd0ef8638a182e3faaccdd0591a536bcc43df2
# Parent 9e624761d74e996f885efb7f6ba51b6a6c691482
Enable PropagateCost assembly primitive
diff -r 9e624761d74e -r f4fd0ef8638a source/common/lowres.cpp
--- a/source/common/lowres.cpp Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/lowres.cpp Wed Aug 31 11:27:21 2016 +0530
@@ -45,7 +45,7 @@
if (qgSize == 8)
cuCountFullRes = maxBlocksInRowFullRes * maxBlocksInColFullRes;
else
- cuCountFullRes = maxBlocksInRow * maxBlocksInCol;
+ cuCountFullRes = cuCount;
/* rounding the width to multiple of lowres CU size */
width = maxBlocksInRow * X265_LOWRES_CU_SIZE;
@@ -60,6 +60,8 @@
CHECKED_MALLOC(invQscaleFactor, int, cuCountFullRes);
CHECKED_MALLOC(qpCuTreeOffset, double, cuCountFullRes);
CHECKED_MALLOC(blockVariance, uint32_t, cuCountFullRes);
+ if (qgSize == 8)
+ CHECKED_MALLOC(invQscaleFactor8x8, int, cuCount);
}
CHECKED_MALLOC(propagateCost, uint16_t, cuCount);
@@ -129,6 +131,7 @@
X265_FREE(qpCuTreeOffset);
X265_FREE(propagateCost);
X265_FREE(blockVariance);
+ X265_FREE(invQscaleFactor8x8);
}
// (re) initialize lowres state
diff -r 9e624761d74e -r f4fd0ef8638a source/common/lowres.h
--- a/source/common/lowres.h Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/lowres.h Wed Aug 31 11:27:21 2016 +0530
@@ -145,6 +145,7 @@
double* qpAqOffset; // AQ QP offset values for each 16x16 CU
double* qpCuTreeOffset; // cuTree QP offset values for each 16x16 CU
int* invQscaleFactor; // qScale values for qp Aq Offsets
+ int* invQscaleFactor8x8; // temporary buffer for qg-size 8
uint32_t* blockVariance;
uint64_t wp_ssd[3]; // This is different than SSDY, this is sum(pixel^2) - sum(pixel)^2 for entire frame
uint64_t wp_sum[3];
diff -r 9e624761d74e -r f4fd0ef8638a source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/pixel.cpp Wed Aug 31 11:27:21 2016 +0530
@@ -845,58 +845,31 @@
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
- const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize)
+ const int32_t* invQscales, const double* fpsFactor, int len)
{
double fps = *fpsFactor / 256; // range[0.01, 1.00]
- if (qgSize == 8)
+ for (int i = 0; i < len; i++)
{
- for (int i = 0; i < len; i++)
- {
- int intraCost = intraCosts[i];
- int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
- int invQscaleFactor = (invQscales[i * 2] + invQscales[i * 2 + 1] + invQscales[i * 2 + len * 2] + invQscales[i * 2 + len * 2 + 1]) / 4;
- double propagateIntra = intraCost * invQscaleFactor; // Q16 x Q8.8 = Q24.8
- double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
- double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
+ int intraCost = intraCosts[i];
+ int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
+ double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
+ double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
+ double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
#if 0
- // algorithm that output match to asm
- float intraRcp = (float)1.0f / intraCost; // VC can't mapping this into RCPPS
- float intraRcpError1 = (float)intraCost * (float)intraRcp;
- intraRcpError1 *= (float)intraRcp;
- float intraRcpError2 = intraRcp + intraRcp;
- float propagateDenom = intraRcpError2 - intraRcpError1;
- dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
+ // algorithm that output match to asm
+ float intraRcp = (float)1.0f / intraCost; // VC can't mapping this into RCPPS
+ float intraRcpError1 = (float)intraCost * (float)intraRcp;
+ intraRcpError1 *= (float)intraRcp;
+ float intraRcpError2 = intraRcp + intraRcp;
+ float propagateDenom = intraRcpError2 - intraRcpError1;
+ dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
#else
- double propagateDenom = (double)intraCost; // Q32
- dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
+ double propagateDenom = (double)intraCost; // Q32
+ dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
#endif
}
- }
- else
- {
- for (int i = 0; i < len; i++)
- {
- int intraCost = intraCosts[i];
- int interCost = X265_MIN(intraCosts[i], interCosts[i] & LOWRES_COST_MASK);
- double propagateIntra = intraCost * invQscales[i]; // Q16 x Q8.8 = Q24.8
- double propagateAmount = (double)propagateIn[i] + propagateIntra * fps; // Q16.0 + Q24.8 x Q0.x = Q25.0
- double propagateNum = (double)(intraCost - interCost); // Q32 - Q32 = Q33.0
-
-#if 0
- // algorithm that output match to asm
- float intraRcp = (float)1.0f / intraCost; // VC can't mapping this into RCPPS
- float intraRcpError1 = (float)intraCost * (float)intraRcp;
- intraRcpError1 *= (float)intraRcp;
- float intraRcpError2 = intraRcp + intraRcp;
- float propagateDenom = intraRcpError2 - intraRcpError1;
- dst[i] = (int)(propagateAmount * propagateNum * (double)propagateDenom + 0.5);
-#else
- double propagateDenom = (double)intraCost; // Q32
- dst[i] = (int)(propagateAmount * propagateNum / propagateDenom + 0.5);
-#endif
- }
- }
+ //}
}
/* Conversion between double and Q8.8 fixed point (big-endian) for storage */
diff -r 9e624761d74e -r f4fd0ef8638a source/common/primitives.h
--- a/source/common/primitives.h Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/primitives.h Wed Aug 31 11:27:21 2016 +0530
@@ -187,7 +187,7 @@
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
typedef pixel (*planeClipAndMax_t)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
-typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len, uint32_t qgSize);
+typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
typedef void (*cutree_fix8_unpack)(double *dst, uint16_t *src, int count);
typedef void (*cutree_fix8_pack)(uint16_t *dst, double *src, int count);
diff -r 9e624761d74e -r f4fd0ef8638a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 31 11:27:21 2016 +0530
@@ -1027,7 +1027,7 @@
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
- //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+ p.propagateCost = PFX(mbtree_propagate_cost_sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
@@ -1312,7 +1312,7 @@
p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx);
p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx);
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx);
- //p.propagateCost = PFX(mbtree_propagate_cost_avx);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -2153,7 +2153,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
- //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx2);
p.fix8Unpack = PFX(cutree_fix8_unpack_avx2);
p.fix8Pack = PFX(cutree_fix8_pack_avx2);
@@ -2356,7 +2356,7 @@
ALL_CHROMA_444_PU(p2s, filterPixelToShort, sse2);
ALL_LUMA_PU(convert_p2s, filterPixelToShort, sse2);
ALL_LUMA_TU(count_nonzero, count_nonzero, sse2);
- //p.propagateCost = PFX(mbtree_propagate_cost_sse2);
+ p.propagateCost = PFX(mbtree_propagate_cost_sse2);
}
if (cpuMask & X265_CPU_SSE3)
{
@@ -2670,7 +2670,7 @@
p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
- //p.propagateCost = PFX(mbtree_propagate_cost_avx);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx);
}
if (cpuMask & X265_CPU_XOP)
{
@@ -3666,7 +3666,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
- //p.propagateCost = PFX(mbtree_propagate_cost_avx2);
+ p.propagateCost = PFX(mbtree_propagate_cost_avx2);
p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
p.saoCuStatsE2 = PFX(saoCuStatsE2_avx2);
diff -r 9e624761d74e -r f4fd0ef8638a source/common/x86/mc-a2.asm
--- a/source/common/x86/mc-a2.asm Mon Aug 29 13:49:09 2016 +0530
+++ b/source/common/x86/mc-a2.asm Wed Aug 31 11:27:21 2016 +0530
@@ -994,7 +994,7 @@
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, int32_t *intra_costs,
-; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len, uint32_t qgSize)
+; uint16_t *inter_costs, int32_t *inv_qscales, double *fps_factor, int len )
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal mbtree_propagate_cost, 7,7,7
diff -r 9e624761d74e -r f4fd0ef8638a source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Mon Aug 29 13:49:09 2016 +0530
+++ b/source/encoder/slicetype.cpp Wed Aug 31 11:27:21 2016 +0530
@@ -237,6 +237,21 @@
}
}
+ if (param->rc.qgSize == 8)
+ {
+ for (int cuY = 0; cuY < heightInCU; cuY++)
+ {
+ for (int cuX = 0; cuX < widthInCU; cuX++)
+ {
+ const int cuXY = cuX + cuY * widthInCU;
+ curFrame->m_lowres.invQscaleFactor8x8[cuXY] = (curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
+ curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
+ curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes] +
+ curFrame->m_lowres.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + curFrame->m_lowres.maxBlocksInRowFullRes + 1]) / 4;
+ }
+ }
+ }
+
if (param->bEnableWeightedPred || param->bEnableWeightedBiPred)
{
int hShift = CHROMA_H_SHIFT(param->internalCsp);
@@ -347,15 +362,9 @@
frame cost estimates, they are not very accurate */
const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
- int invQscaleFactor, icostAq;
+ int icostAq;
if (qgSize == 8)
- {
- invQscaleFactor = (fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
- fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
- fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes] +
- fenc.invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc.maxBlocksInRowFullRes + 1]) / 4;
- icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * invQscaleFactor + 128) >> 8) : icost;
- }
+ icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor8x8[cuXY] + 128) >> 8) : icost;
else
icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? ((icost * fenc.invQscaleFactor[cuXY] +128) >> 8) : icost;
@@ -1816,11 +1825,11 @@
if (m_param->rc.qgSize == 8)
primitives.propagateCost(m_scratch, propagateCost,
frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
- frames[b]->invQscaleFactor + (cuIndex * 4), &fpsFactor, m_8x8Width, m_param->rc.qgSize);
+ frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
else
primitives.propagateCost(m_scratch, propagateCost,
frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
- frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width, m_param->rc.qgSize);
+ frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);
if (referenced)
propagateCost += m_8x8Width;
@@ -1913,12 +1922,7 @@
for (int cuX = 0; cuX < m_8x8Width; cuX++)
{
const int cuXY = cuX + cuY * m_8x8Width;
- int invQscaleFactor = (frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4] +
- frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + 1] +
- frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] +
- frame->invQscaleFactor[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1]) / 4;
-
- int intracost = ((frame->intraCost[cuXY]) / 4 * invQscaleFactor + 128) >> 8;
+ int intracost = ((frame->intraCost[cuXY]) / 4 * frame->invQscaleFactor8x8[cuXY] + 128) >> 8;
if (intracost)
{
int propagateCost = ((frame->propagateCost[cuXY]) / 4 * fpsFactor + 128) >> 8;
@@ -1928,7 +1932,6 @@
frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes] - m_cuTreeStrength * (log2_ratio);
frame->qpCuTreeOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] = frame->qpAqOffset[cuX * 2 + cuY * m_8x8Width * 4 + frame->maxBlocksInRowFullRes + 1] - m_cuTreeStrength * (log2_ratio);
}
-
}
}
}
@@ -2292,15 +2295,9 @@
/* do not include edge blocks in the frame cost estimates, they are not very accurate */
const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 &&
cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2;
- int invQscaleFactor, bcostAq;
+ int bcostAq;
if (m_lookahead.m_param->rc.qgSize == 8)
- {
- invQscaleFactor = (fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4] +
- fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + 1] +
- fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes] +
- fenc->invQscaleFactor[cuX * 2 + cuY * widthInCU * 4 + fenc->maxBlocksInRowFullRes + 1]) / 4;
- bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * invQscaleFactor + 128) >> 8) : bcost;
- }
+ bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor8x8[cuXY] + 128) >> 8) : bcost;
else
bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? ((bcost * fenc->invQscaleFactor[cuXY] +128) >> 8) : bcost;
diff -r 9e624761d74e -r f4fd0ef8638a source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Aug 29 13:49:09 2016 +0530
+++ b/source/test/pixelharness.cpp Wed Aug 31 11:27:21 2016 +0530
@@ -1387,8 +1387,8 @@
{
int width = 16 + rand() % 64;
int index = i % TEST_CASES;
- checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
- ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width, 32);
+ checked(opt, opt_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
+ ref(ref_dest, ushort_test_buff[index] + j, int_test_buff[index] + j, ushort_test_buff[index] + j, int_test_buff[index] + j, &fps, width);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
return false;
@@ -3102,7 +3102,7 @@
if (opt.propagateCost)
{
HEADER0("propagateCost");
- REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80, 32);
+ REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80);
}
if (opt.fix8Pack)
diff -r 9e624761d74e -r f4fd0ef8638a source/x265.h
--- a/source/x265.h Mon Aug 29 13:49:09 2016 +0530
+++ b/source/x265.h Wed Aug 31 11:27:21 2016 +0530
@@ -263,8 +263,8 @@
/* An array of quantizer offsets to be applied to this image during encoding.
* These are added on top of the decisions made by rateControl.
* Adaptive quantization must be enabled to use this feature. These quantizer
- * offsets should be given for each 16x16 block. Behavior if quant
- * offsets differ between encoding passes is undefined. */
+ * offsets should be given for each 16x16 block (8x8 block, when qg-size is 8).
+ * Behavior if quant offsets differ between encoding passes is undefined. */
float *quantOffsets;
/* Frame level statistics */
More information about the x265-devel mailing list