[x265] Custom LowRes scale
Nicolas Morey-Chaisemartin
nmorey at kalray.eu
Mon Jul 21 17:19:17 CEST 2014
Hi,
We recently profiled x265 pre-analysis to estimate what performance we could reach using our accelerator and I was quite disappointed by the performance.
When running on a Core-i7 with AVX at roughly 2.7GHz, we barely reached the 30fps mark using ultrafast preset on a 4K video.
After a little bit of browsing, I realized that work in LowRes is always done at 1/4th of the final resolution, which seems fair but requires a huge amount of work for 4K.
It seemed straightforward enough to change the divider at LowRes initialization, but it turns out there are a lot of hard-coded values that depend on both the LowRes divider and the LowRes CU size.
Here's a patch (definitely not applicable as-is, but just to give an idea of where I'm going) that seems to fix most of the hard-coded values.
It still works with an X265_LOWRES_SCALE of 4 and the performance is definitely improving (29fps => 40fps on a 2048x1024 medium preset on an E5504).
Would you be interested in a clean version of this? At least the hard-coded CU_SIZE part?
IMHO it would be better to have a "dynamic" value for LowRes depending on the preset (or equivalent) and the input resolution...
1/4th is fast enough in HD not to be an issue, but for real-time (RT) streams in 4K or more, 1/16th will be compulsory.
Nicolas
---
x265/source/common/common.h | 1 +
x265/source/common/lowres.cpp | 4 ++--
x265/source/encoder/frameencoder.cpp | 7 ++++---
x265/source/encoder/ratecontrol.cpp | 16 ++++++++--------
x265/source/encoder/slicetype.cpp | 8 ++++----
5 files changed, 19 insertions(+), 17 deletions(-)
diff --git a/x265/source/common/common.h b/x265/source/common/common.h
index 06f60e7..00e73fc 100644
--- a/x265/source/common/common.h
+++ b/x265/source/common/common.h
@@ -156,6 +156,7 @@ typedef int32_t coeff_t; // transform coefficient
// high cost estimates (intra and inter both suffer)
#define X265_LOWRES_CU_SIZE 8
#define X265_LOWRES_CU_BITS 3
+#define X265_LOWRES_SCALE 2
#define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * (count))
#define X265_FREE(ptr) x265_free(ptr)
diff --git a/x265/source/common/lowres.cpp b/x265/source/common/lowres.cpp
index 5fc2f6b..6138023 100644
--- a/x265/source/common/lowres.cpp
+++ b/x265/source/common/lowres.cpp
@@ -31,8 +31,8 @@ bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)
{
isLowres = true;
bframes = _bframes;
- width = orig->getWidth() / 2;
- lines = orig->getHeight() / 2;
+ width = orig->getWidth() / X265_LOWRES_SCALE;
+ lines = orig->getHeight() / X265_LOWRES_SCALE;
lumaStride = width + 2 * orig->getLumaMarginX();
if (lumaStride & 31)
lumaStride += 32 - (lumaStride & 31);
diff --git a/x265/source/encoder/frameencoder.cpp b/x265/source/encoder/frameencoder.cpp
index 8c3ee26..7213f60 100644
--- a/x265/source/encoder/frameencoder.cpp
+++ b/x265/source/encoder/frameencoder.cpp
@@ -1300,9 +1300,10 @@ int FrameEncoder::calcQpForCu(uint32_t cuAddr, double baseQp)
/* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
double qp_offset = 0;
- int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) / 16;
- int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) / 16;
- int noOfBlocks = g_maxCUSize / 16;
+ int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
+ int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu - 1)) / lowResCu;
+ int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu - 1)) / lowResCu;
+ int noOfBlocks = g_maxCUSize / lowResCu;
int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * noOfBlocks;
int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()->getFrameWidthInCU();
diff --git a/x265/source/encoder/ratecontrol.cpp b/x265/source/encoder/ratecontrol.cpp
index 4358994..5fcc27a 100644
--- a/x265/source/encoder/ratecontrol.cpp
+++ b/x265/source/encoder/ratecontrol.cpp
@@ -161,8 +161,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic)
if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
{
/* Need to init it anyways for CU tree */
- int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
int cuCount = cuWidth * cuHeight;
if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
@@ -194,9 +194,9 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic)
if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
{
double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
- for (block_y = 0; block_y < maxRow; block_y += 16)
+ for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
- for (block_x = 0; block_x < maxCol; block_x += 16)
+ for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
uint32_t energy = acEnergyCu(pic, block_x, block_y);
qp_adj = pow(energy + 1, 0.1);
@@ -216,9 +216,9 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic)
strength = m_param->rc.aqStrength * 1.0397f;
block_xy = 0;
- for (block_y = 0; block_y < maxRow; block_y += 16)
+ for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
- for (block_x = 0; block_x < maxCol; block_x += 16)
+ for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
{
@@ -260,8 +260,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic)
RateControl::RateControl(x265_param *p)
{
m_param = p;
- int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_ncu = lowresCuWidth * lowresCuHeight;
if (m_param->rc.cuTree)
diff --git a/x265/source/encoder/slicetype.cpp b/x265/source/encoder/slicetype.cpp
index 34d0b3b..4a2f2cb 100644
--- a/x265/source/encoder/slicetype.cpp
+++ b/x265/source/encoder/slicetype.cpp
@@ -65,8 +65,8 @@ Lookahead::Lookahead(x265_param *param, ThreadPool* pool)
m_lastNonB = NULL;
m_bFilling = true;
m_bFlushed = false;
- m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
memset(m_histogram, 0, sizeof(m_histogram));
}
@@ -1201,8 +1201,8 @@ CostEstimate::~CostEstimate()
void CostEstimate::init(x265_param *_param, Frame *pic)
{
m_param = _param;
- m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_rows = new EstimateRow[m_heightInCU];
for (int i = 0; i < m_heightInCU; i++)
More information about the x265-devel
mailing list