[x265] [PATCH] lowres: Enhanced scaling
Nicolas Morey-Chaisemartin
nmorey at kalray.eu
Tue Jul 22 11:55:11 CEST 2014
# HG changeset patch
# User Nicolas Morey-Chaisemartin <nmorey at kalray.eu>
# Date 1406020650 -7200
# Tue Jul 22 11:17:30 2014 +0200
# Node ID fc75f5f4f85e0d9441dc73b09ec6aaaa0a36c20f
# Parent 4c9ce4db74d1c9768abc61290bd1bda002b79f4e
lowres: Enhanced scaling
* Replace hard-coded values with X265_LOWRES_CU_SIZE
* Add an X265_LOWRES_SCALE define to tweak the downscaling divider for the lowres frame
Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 so that the standard filter primitives can still be used for weight prediction.
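The patch enforces this at runtime with an X265_CHECK in Lowres::create (see the diff below). As an illustration only, and not part of the patch, the same constraint could also be expressed at compile time, assuming the two defines from common.h and a C++11 toolchain:

    /* Sketch only: compile-time form of the constraint described in the note above.
     * Assumes X265_LOWRES_CU_SIZE and X265_LOWRES_SCALE as defined in common.h. */
    static_assert(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE <= 64,
                  "a lowres CU mapped back to full resolution must not exceed the "
                  "largest block size (64) handled by the standard filter primitives");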
Performance Impact:
Command Line:
./x265/build/x265 --preset medium --accel=none red_kayak_1080p-420.y4m kayak.hevc --bitrate=$BITRATE --ssim
- BITRATE=4000 X265_LOWRES_SCALE=2
encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 ( 9.588 dB)
- BITRATE=4000 X265_LOWRES_SCALE=4
encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 9.343 dB)
- BITRATE=4000 X265_LOWRES_SCALE=8
encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 9.086 dB)
- BITRATE=9000 X265_LOWRES_SCALE=2
encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 (11.384 dB)
- BITRATE=9000 X265_LOWRES_SCALE=4
encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 (11.024 dB)
- BITRATE=9000 X265_LOWRES_SCALE=8
encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 (10.761 dB)
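For reference, a minimal sketch of how the lowres dimensions and CU grid scale with X265_LOWRES_SCALE, mirroring the width/lines/cuWidth/cuHeight computations the patch parameterizes (the helper below is illustrative and not part of the patch; values shown for a 1920x1080 source):

    #include <cstdio>

    #define X265_LOWRES_CU_SIZE 8
    #define X265_LOWRES_CU_BITS 3

    /* Illustrative helper: same rounding as Lowres::create / RateControl. */
    static void lowresDims(int srcW, int srcH, int scale)
    {
        int width = srcW / scale;
        int lines = srcH / scale;
        int cuW   = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
        int cuH   = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
        printf("scale %d: lowres %dx%d, %dx%d CUs (%d total)\n",
               scale, width, lines, cuW, cuH, cuW * cuH);
    }

    int main()
    {
        lowresDims(1920, 1080, 2);   /* default: 960x540 lowres, 120x68 CUs */
        lowresDims(1920, 1080, 4);   /* 480x270 lowres, 60x34 CUs */
        lowresDims(1920, 1080, 8);   /* 240x135 lowres, 30x17 CUs */
        return 0;
    }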
diff --git a/source/common/common.h b/source/common/common.h
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -153,16 +153,17 @@ typedef int32_t coeff_t; // transf
// arbitrary, but low because SATD scores are 1/4 normal
#define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
#define X265_LOOKAHEAD_MAX 250
// Use the same size blocks as x264. Using larger blocks seems to give artificially
// high cost estimates (intra and inter both suffer)
#define X265_LOWRES_CU_SIZE 8
#define X265_LOWRES_CU_BITS 3
+#define X265_LOWRES_SCALE 2
#define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * (count))
#define X265_FREE(ptr) x265_free(ptr)
#define CHECKED_MALLOC(var, type, count) \
{ \
var = (type*)x265_malloc(sizeof(type) * (count)); \
if (!var) \
{ \
diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
--- a/source/common/lowres.cpp
+++ b/source/common/lowres.cpp
@@ -24,20 +24,21 @@
#include "TLibCommon/TComPicYuv.h"
#include "lowres.h"
#include "mv.h"
using namespace x265;
bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)
{
+ X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE <= 64, "Invalid LowRes scaling\n");
isLowres = true;
bframes = _bframes;
- width = orig->getWidth() / 2;
- lines = orig->getHeight() / 2;
+ width = orig->getWidth() / X265_LOWRES_SCALE;
+ lines = orig->getHeight() / X265_LOWRES_SCALE;
lumaStride = width + 2 * orig->getLumaMarginX();
if (lumaStride & 31)
lumaStride += 32 - (lumaStride & 31);
int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
int cuCount = cuWidth * cuHeight;
/* rounding the width to multiple of lowres CU size */
diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c
if (bIsVbv)
{
m_frame->m_cuCostsForVbv[cuAddr] = 0;
m_frame->m_intraCuCostsForVbv[cuAddr] = 0;
}
/* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
double qp_offset = 0;
- int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) / 16;
- int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) / 16;
- int noOfBlocks = g_maxCUSize / 16;
+ int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
+ int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu - 1)) / lowResCu;
+ int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu - 1)) / lowResCu;
+ int noOfBlocks = g_maxCUSize / lowResCu;
int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * noOfBlocks;
int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()->getFrameWidthInCU();
/* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
double *qpoffs = (m_isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
int cnt = 0, idx = 0;
for (int h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp
+++ b/source/encoder/ratecontrol.cpp
@@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame
/* Calculate Qp offset for each 16x16 block in the frame */
int block_xy = 0;
int block_x = 0, block_y = 0;
double strength = 0.f;
if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
{
/* Need to init it anyways for CU tree */
- int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
int cuCount = cuWidth * cuHeight;
if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
{
memset(pic->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
memset(pic->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
for (int cuxy = 0; cuxy < cuCount; cuxy++)
{
@@ -231,19 +231,19 @@ void RateControl::calcAdaptiveQuantFrame
}
else
{
block_xy = 0;
double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
{
double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
- for (block_y = 0; block_y < maxRow; block_y += 16)
+ for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
- for (block_x = 0; block_x < maxCol; block_x += 16)
+ for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
uint32_t energy = acEnergyCu(pic, block_x, block_y);
qp_adj = pow(energy + 1, 0.1);
pic->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
avg_adj += qp_adj;
avg_adj_pow2 += qp_adj * qp_adj;
block_xy++;
}
@@ -253,19 +253,19 @@ void RateControl::calcAdaptiveQuantFrame
avg_adj_pow2 /= m_ncu;
strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
}
else
strength = m_param->rc.aqStrength * 1.0397f;
block_xy = 0;
- for (block_y = 0; block_y < maxRow; block_y += 16)
+ for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
- for (block_x = 0; block_x < maxCol; block_x += 16)
+ for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
{
if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
{
qp_adj = pic->m_lowres.qpCuTreeOffset[block_xy];
qp_adj = strength * (qp_adj - avg_adj);
}
else
{
@@ -297,18 +297,18 @@ void RateControl::calcAdaptiveQuantFrame
pic->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
}
}
}
RateControl::RateControl(x265_param *p)
{
m_param = p;
- int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_ncu = lowresCuWidth * lowresCuHeight;
if (m_param->rc.cuTree)
m_qCompress = 1;
else
m_qCompress = m_param->rc.qCompress;
// validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp
+++ b/source/encoder/slicetype.cpp
@@ -61,18 +61,18 @@ Lookahead::Lookahead(x265_param *param,
{
m_bReady = 0;
m_param = param;
m_top = enc;
m_lastKeyframe = -m_param->keyframeMax;
m_lastNonB = NULL;
m_bFilling = true;
m_bFlushed = false;
- m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
memset(m_histogram, 0, sizeof(m_histogram));
}
Lookahead::~Lookahead() { }
void Lookahead::init()
{
@@ -1205,18 +1205,18 @@ CostEstimate::~CostEstimate()
}
delete[] m_rows;
}
void CostEstimate::init(x265_param *_param, Frame *pic)
{
m_param = _param;
- m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
- m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
+ m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
m_rows = new EstimateRow[m_heightInCU];
for (int i = 0; i < m_heightInCU; i++)
{
m_rows[i].m_widthInCU = m_widthInCU;
m_rows[i].m_heightInCU = m_heightInCU;
}
diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp
+++ b/source/encoder/weightPrediction.cpp
@@ -52,39 +52,40 @@ int sliceHeaderCost(WeightParam *w, int
return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
}
/* make a motion compensated copy of lowres ref into mcout with the same stride.
* The borders of mcout are not extended */
void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
{
int stride = ref.lumaStride;
- const int cuSize = 8;
+ const int cuSize = X265_LOWRES_CU_SIZE;
+ const int partSize = partitionFromSizes(cuSize, cuSize);
MV mvmin, mvmax;
int cu = 0;
for (int y = 0; y < ref.lines; y += cuSize)
{
int pixoff = y * stride;
mvmin.y = (int16_t)((-y - 8) << 2);
mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
{
- ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
- intptr_t bstride = 8;
+ ALIGN_VAR_16(pixel, buf[cuSize * cuSize]);
+ intptr_t bstride = cuSize;
mvmin.x = (int16_t)((-x - 8) << 2);
mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
/* clip MV to available pixels */
MV mv = mvs[cu];
mv = mv.clipped(mvmin, mvmax);
- pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
- primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
+ pixel *tmp = ref.lowresMC(pixoff, mv, buf, bstride);
+ primitives.luma_copy_pp[partSize](mcout + pixoff, stride, tmp, bstride);
}
}
x265_emms();
}
/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
* if a block had cheaper lowres cost as intra, we treat it as MV 0 */
@@ -94,71 +95,73 @@ void mcChroma(pixel * mcout,
const MV * mvs,
const Cache& cache,
int height,
int width)
{
/* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
* luma blocks. We have to adapt block size to chroma csp */
int csp = cache.csp;
- int bw = 16 >> cache.hshift;
- int bh = 16 >> cache.vshift;
+ int cuSize = X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE;
+ const int partSize = partitionFromSizes(cuSize, cuSize);
+ int bw = cuSize >> cache.hshift;
+ int bh = cuSize >> cache.vshift;
MV mvmin, mvmax;
for (int y = 0; y < height; y += bh)
{
/* note: lowres block count per row might be different from chroma block
* count per row because of rounding issues, so be very careful with indexing
* into the lowres structures */
int cu = y * cache.lowresWidthInCU;
int pixoff = y * stride;
mvmin.y = (int16_t)((-y - 8) << 2);
mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
{
if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
{
- MV mv = mvs[cu]; // lowres MV
- mv <<= 1; // fullres MV
+ MV mv = mvs[cu]; // lowres MV
+ mv *= X265_LOWRES_SCALE; // fullres MV
mv.x >>= cache.hshift;
mv.y >>= cache.vshift;
/* clip MV to available pixels */
mvmin.x = (int16_t)((-x - 8) << 2);
mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
mv = mv.clipped(mvmin, mvmax);
int fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
pixel *temp = src + pixoff + fpeloffset;
int xFrac = mv.x & 0x7;
int yFrac = mv.y & 0x7;
if ((yFrac | xFrac) == 0)
{
- primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
+ primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, temp, stride);
}
else if (yFrac == 0)
{
- primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
+ primitives.chroma[csp].filter_hpp[partSize](temp, stride, mcout + pixoff, stride, xFrac);
}
else if (xFrac == 0)
{
- primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
+ primitives.chroma[csp].filter_vpp[partSize](temp, stride, mcout + pixoff, stride, yFrac);
}
else
{
- ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
- primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
- primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
+ ALIGN_VAR_16(int16_t, imm[cuSize * (cuSize + NTAPS_CHROMA)]);
+ primitives.chroma[csp].filter_hps[partSize](temp, stride, imm, bw, xFrac, 1);
+ primitives.chroma[csp].filter_vsp[partSize](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
}
}
else
{
- primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
+ primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, src + pixoff, stride);
}
}
}
x265_emms();
}
/* Measure sum of 8x8 satd costs between source frame and reference
@@ -191,21 +194,24 @@ uint32_t weightCost(pixel * fenc
}
uint32_t cost = 0;
pixel *f = fenc, *r = ref;
if (bLuma)
{
int cu = 0;
- for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+ int cuSize = X265_LOWRES_CU_SIZE;
+ int partSize = partitionFromSizes(cuSize, cuSize);
+
+ for (int y = cuSize; y < height; y += cuSize, r += cuSize * stride, f += cuSize * stride)
{
- for (int x = 8; x < width; x += 8, cu++)
+ for (int x = cuSize; x < width; x += cuSize, cu++)
{
- int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
+ int cmp = primitives.satd[partSize](r + x, stride, f + x, stride);
cost += X265_MIN(cmp, cache.intraCost[cu]);
}
}
}
else if (cache.csp == X265_CSP_I444)
for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
{
for (int x = 16; x < width; x += 16)
@@ -235,18 +241,18 @@ void weightAnalyse(Slice& slice, x265_pa
TComPicYuv *fencYuv = slice.m_pic->getPicYuvOrg();
Lowres& fenc = slice.m_pic->m_lowres;
Cache cache;
memset(&cache, 0, sizeof(cache));
cache.intraCost = fenc.intraCost;
cache.numPredDir = slice.isInterP() ? 1 : 2;
- cache.lowresWidthInCU = fenc.width >> 3;
- cache.lowresHeightInCU = fenc.lines >> 3;
+ cache.lowresWidthInCU = fenc.width >> X265_LOWRES_CU_BITS;
+ cache.lowresHeightInCU = fenc.lines >> X265_LOWRES_CU_BITS;
cache.csp = fencYuv->m_picCsp;
cache.hshift = CHROMA_H_SHIFT(cache.csp);
cache.vshift = CHROMA_V_SHIFT(cache.csp);
/* Use single allocation for motion compensated ref and weight buffers */
pixel *mcbuf = X265_MALLOC(pixel, 2 * fencYuv->getStride() * fencYuv->getHeight());
if (!mcbuf)
{
--
Nicolas Morey Chaisemartin
Phone : +33 6 42 46 68 87