[x265] [PATCH] lowres: Enhanced scaling
Steve Borho
steve at borho.org
Thu Jul 24 07:21:24 CEST 2014
On 07/22, Nicolas Morey-Chaisemartin wrote:
> # HG changeset patch
> # User Nicolas Morey-Chaisemartin <nmorey at kalray.eu>
> # Date 1406020650 -7200
> # Tue Jul 22 11:17:30 2014 +0200
> # Node ID fc75f5f4f85e0d9441dc73b09ec6aaaa0a36c20f
> # Parent 4c9ce4db74d1c9768abc61290bd1bda002b79f4e
> lowres: Enhanced scaling
>
> * Replace hard coded values with X265_LOWRES_CU_SIZE
> * Add X265_LOWRES_SCALE define to tweak divider for LowRes
>
> Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 to be able to use the standard filters for weight prediction
>
> Performance Impact:
> Command Line:
> ./x265/build/x265 --preset medium --accel=none red_kayak_1080p-420.y4m kayak.hevc --bitrate=$BITRATE --ssim
>
> - BITRATE=4000 X265_LOWRES_SCALE=2
> encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 ( 9.588 dB)
> - BITRATE=4000 X265_LOWRES_SCALE=4
> encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 9.343 dB)
> - BITRATE=4000 X265_LOWRES_SCALE=8
> encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 9.086 dB)
>
> - BITRATE=9000 X265_LOWRES_SCALE=2
> encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 (11.384 dB)
> - BITRATE=9000 X265_LOWRES_SCALE=4
> encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 (11.024 dB)
> - BITRATE=9000 X265_LOWRES_SCALE=8
> encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 (10.761 dB)
Those are pretty significant drops in SSIM; something is likely
broken.
> diff --git a/source/common/common.h b/source/common/common.h
> --- a/source/common/common.h
> +++ b/source/common/common.h
> @@ -153,16 +153,17 @@ typedef int32_t coeff_t; // transf
> // arbitrary, but low because SATD scores are 1/4 normal
> #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
> #define X265_LOOKAHEAD_MAX 250
> // Use the same size blocks as x264. Using larger blocks seems to give artificially
> // high cost estimates (intra and inter both suffer)
> #define X265_LOWRES_CU_SIZE 8
> #define X265_LOWRES_CU_BITS 3
> +#define X265_LOWRES_SCALE 2
> #define X265_MALLOC(type, count) (type*)x265_malloc(sizeof(type) * (count))
> #define X265_FREE(ptr) x265_free(ptr)
> #define CHECKED_MALLOC(var, type, count) \
> { \
> var = (type*)x265_malloc(sizeof(type) * (count)); \
> if (!var) \
> { \
> diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
> --- a/source/common/lowres.cpp
> +++ b/source/common/lowres.cpp
> @@ -24,20 +24,21 @@
> #include "TLibCommon/TComPicYuv.h"
> #include "lowres.h"
> #include "mv.h"
> using namespace x265;
> bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)
> {
> + X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE <= 64, "Invalid LowRes scaling\n");
> isLowres = true;
> bframes = _bframes;
> - width = orig->getWidth() / 2;
> - lines = orig->getHeight() / 2;
> + width = orig->getWidth() / X265_LOWRES_SCALE;
> + lines = orig->getHeight() / X265_LOWRES_SCALE;
> lumaStride = width + 2 * orig->getLumaMarginX();
> if (lumaStride & 31)
> lumaStride += 32 - (lumaStride & 31);
> int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> int cuCount = cuWidth * cuHeight;
> /* rounding the width to multiple of lowres CU size */
> diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp
> +++ b/source/encoder/frameencoder.cpp
> @@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c
> if (bIsVbv)
> {
> m_frame->m_cuCostsForVbv[cuAddr] = 0;
> m_frame->m_intraCuCostsForVbv[cuAddr] = 0;
> }
> /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
> double qp_offset = 0;
> - int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) / 16;
> - int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) / 16;
> - int noOfBlocks = g_maxCUSize / 16;
> + int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
> + int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu - 1)) / lowResCu;
> + int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu - 1)) / lowResCu;
> + int noOfBlocks = g_maxCUSize / lowResCu;
> int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * noOfBlocks;
> int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()->getFrameWidthInCU();
> /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
> double *qpoffs = (m_isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
> int cnt = 0, idx = 0;
> for (int h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
> diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp
> +++ b/source/encoder/ratecontrol.cpp
> @@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame
> /* Calculate Qp offset for each 16x16 block in the frame */
> int block_xy = 0;
> int block_x = 0, block_y = 0;
> double strength = 0.f;
> if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
> {
> /* Need to init it anyways for CU tree */
> - int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> - int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> int cuCount = cuWidth * cuHeight;
> if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
> {
> memset(pic->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
> memset(pic->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
> for (int cuxy = 0; cuxy < cuCount; cuxy++)
> {
> @@ -231,19 +231,19 @@ void RateControl::calcAdaptiveQuantFrame
> }
> else
> {
> block_xy = 0;
> double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
> if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
> {
> double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
> - for (block_y = 0; block_y < maxRow; block_y += 16)
> + for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
> {
> - for (block_x = 0; block_x < maxCol; block_x += 16)
> + for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
> {
> uint32_t energy = acEnergyCu(pic, block_x, block_y);
> qp_adj = pow(energy + 1, 0.1);
> pic->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
> avg_adj += qp_adj;
> avg_adj_pow2 += qp_adj * qp_adj;
> block_xy++;
> }
> @@ -253,19 +253,19 @@ void RateControl::calcAdaptiveQuantFrame
> avg_adj_pow2 /= m_ncu;
> strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
> avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
> }
> else
> strength = m_param->rc.aqStrength * 1.0397f;
> block_xy = 0;
> - for (block_y = 0; block_y < maxRow; block_y += 16)
> + for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
> {
> - for (block_x = 0; block_x < maxCol; block_x += 16)
> + for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
> {
> if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
> {
> qp_adj = pic->m_lowres.qpCuTreeOffset[block_xy];
> qp_adj = strength * (qp_adj - avg_adj);
> }
> else
> {
> @@ -297,18 +297,18 @@ void RateControl::calcAdaptiveQuantFrame
> pic->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
> }
> }
> }
> RateControl::RateControl(x265_param *p)
> {
> m_param = p;
> - int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> - int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> m_ncu = lowresCuWidth * lowresCuHeight;
> if (m_param->rc.cuTree)
> m_qCompress = 1;
> else
> m_qCompress = m_param->rc.qCompress;
> // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
> diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp
> +++ b/source/encoder/slicetype.cpp
> @@ -61,18 +61,18 @@ Lookahead::Lookahead(x265_param *param,
> {
> m_bReady = 0;
> m_param = param;
> m_top = enc;
> m_lastKeyframe = -m_param->keyframeMax;
> m_lastNonB = NULL;
> m_bFilling = true;
> m_bFlushed = false;
> - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
> memset(m_histogram, 0, sizeof(m_histogram));
> }
> Lookahead::~Lookahead() { }
> void Lookahead::init()
> {
> @@ -1205,18 +1205,18 @@ CostEstimate::~CostEstimate()
> }
> delete[] m_rows;
> }
> void CostEstimate::init(x265_param *_param, Frame *pic)
> {
> m_param = _param;
> - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> + m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> m_rows = new EstimateRow[m_heightInCU];
> for (int i = 0; i < m_heightInCU; i++)
> {
> m_rows[i].m_widthInCU = m_widthInCU;
> m_rows[i].m_heightInCU = m_heightInCU;
> }
> diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp
> +++ b/source/encoder/weightPrediction.cpp
> @@ -52,39 +52,40 @@ int sliceHeaderCost(WeightParam *w, int
> return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
> }
> /* make a motion compensated copy of lowres ref into mcout with the same stride.
> * The borders of mcout are not extended */
> void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
> {
> int stride = ref.lumaStride;
> - const int cuSize = 8;
> + const int cuSize = X265_LOWRES_CU_SIZE;
> + const int partSize = partitionFromSizes(cuSize, cuSize);
> MV mvmin, mvmax;
> int cu = 0;
> for (int y = 0; y < ref.lines; y += cuSize)
> {
> int pixoff = y * stride;
> mvmin.y = (int16_t)((-y - 8) << 2);
> mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
> for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
> {
> - ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
> - intptr_t bstride = 8;
> + ALIGN_VAR_16(pixel, buf[cuSize * cuSize]);
> + intptr_t bstride = cuSize;
> mvmin.x = (int16_t)((-x - 8) << 2);
> mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
> /* clip MV to available pixels */
> MV mv = mvs[cu];
> mv = mv.clipped(mvmin, mvmax);
> - pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
> - primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
> + pixel *tmp = ref.lowresMC(pixoff, mv, buf, bstride);
> + primitives.luma_copy_pp[partSize](mcout + pixoff, stride, tmp, bstride);
> }
> }
> x265_emms();
> }
> /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
> * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
> @@ -94,71 +95,73 @@ void mcChroma(pixel * mcout,
> const MV * mvs,
> const Cache& cache,
> int height,
> int width)
> {
> /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
> * luma blocks. We have to adapt block size to chroma csp */
> int csp = cache.csp;
> - int bw = 16 >> cache.hshift;
> - int bh = 16 >> cache.vshift;
> + int cuSize = X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE;
> + const int partSize = partitionFromSizes(cuSize, cuSize);
> + int bw = cuSize >> cache.hshift;
> + int bh = cuSize >> cache.vshift;
> MV mvmin, mvmax;
> for (int y = 0; y < height; y += bh)
> {
> /* note: lowres block count per row might be different from chroma block
> * count per row because of rounding issues, so be very careful with indexing
> * into the lowres structures */
> int cu = y * cache.lowresWidthInCU;
> int pixoff = y * stride;
> mvmin.y = (int16_t)((-y - 8) << 2);
> mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
> for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
> {
> if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
> {
> - MV mv = mvs[cu]; // lowres MV
> - mv <<= 1; // fullres MV
> + MV mv = mvs[cu]; // lowres MV
> + mv *= X265_LOWRES_SCALE; // fullres MV
> mv.x >>= cache.hshift;
> mv.y >>= cache.vshift;
> /* clip MV to available pixels */
> mvmin.x = (int16_t)((-x - 8) << 2);
> mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
> mv = mv.clipped(mvmin, mvmax);
> int fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
> pixel *temp = src + pixoff + fpeloffset;
> int xFrac = mv.x & 0x7;
> int yFrac = mv.y & 0x7;
> if ((yFrac | xFrac) == 0)
> {
> - primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
> + primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, temp, stride);
> }
> else if (yFrac == 0)
> {
> - primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
> + primitives.chroma[csp].filter_hpp[partSize](temp, stride, mcout + pixoff, stride, xFrac);
> }
> else if (xFrac == 0)
> {
> - primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
> + primitives.chroma[csp].filter_vpp[partSize](temp, stride, mcout + pixoff, stride, yFrac);
> }
> else
> {
> - ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
> - primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
> - primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
> + ALIGN_VAR_16(int16_t, imm[cuSize * (cuSize + NTAPS_CHROMA)]);
> + primitives.chroma[csp].filter_hps[partSize](temp, stride, imm, bw, xFrac, 1);
> + primitives.chroma[csp].filter_vsp[partSize](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
> }
> }
> else
> {
> - primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
> + primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, src + pixoff, stride);
> }
> }
> }
> x265_emms();
> }
> /* Measure sum of 8x8 satd costs between source frame and reference
> @@ -191,21 +194,24 @@ uint32_t weightCost(pixel * fenc
> }
> uint32_t cost = 0;
> pixel *f = fenc, *r = ref;
> if (bLuma)
> {
> int cu = 0;
> - for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
> + int cuSize = X265_LOWRES_CU_SIZE;
I think you're missing a scale here
> + int partSize = partitionFromSizes(cuSize, cuSize);
> +
> + for (int y = cuSize; y < height; y += cuSize, r += cuSize * stride, f += cuSize * stride)
> {
> - for (int x = 8; x < width; x += 8, cu++)
> + for (int x = cuSize; x < width; x += cuSize, cu++)
> {
> - int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
> + int cmp = primitives.satd[partSize](r + x, stride, f + x, stride);
> cost += X265_MIN(cmp, cache.intraCost[cu]);
> }
> }
> }
> else if (cache.csp == X265_CSP_I444)
> for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
> {
> for (int x = 16; x < width; x += 16)
> @@ -235,18 +241,18 @@ void weightAnalyse(Slice& slice, x265_pa
> TComPicYuv *fencYuv = slice.m_pic->getPicYuvOrg();
> Lowres& fenc = slice.m_pic->m_lowres;
> Cache cache;
> memset(&cache, 0, sizeof(cache));
> cache.intraCost = fenc.intraCost;
> cache.numPredDir = slice.isInterP() ? 1 : 2;
> - cache.lowresWidthInCU = fenc.width >> 3;
> - cache.lowresHeightInCU = fenc.lines >> 3;
> + cache.lowresWidthInCU = fenc.width >> X265_LOWRES_CU_BITS;
> + cache.lowresHeightInCU = fenc.lines >> X265_LOWRES_CU_BITS;
And I didn't see any place that actually performs the multiple downscales.
--
Steve Borho
More information about the x265-devel
mailing list