[x265] [PATCH] lowres: Enhanced scaling

Thu Jul 24 07:21:24 CEST 2014

On 07/22, Nicolas Morey-Chaisemartin wrote:
> # HG changeset patch
> # User Nicolas Morey-Chaisemartin <nmorey at kalray.eu>
> # Date 1406020650 -7200
> #      Tue Jul 22 11:17:30 2014 +0200
> # Node ID fc75f5f4f85e0d9441dc73b09ec6aaaa0a36c20f
> # Parent  4c9ce4db74d1c9768abc61290bd1bda002b79f4e
> lowres: Enhanced scaling
> 
>  * Replace hard coded values with X265_LOWRES_CU_SIZE
>  * Add X265_LOWRES_SCALE define to tweak divider for LowRes
> 
> Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 to be able to use standard filters for weight prediction.
> 
> Performance Impact:
> Command Line:
> ./x265/build/x265     --preset medium  --accel=none  red_kayak_1080p-420.y4m kayak.hevc --bitrate=$BITRATE --ssim
> 
> - BITRATE=4000 X265_LOWRES_SCALE=2
> encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 ( 9.588 dB)
> - BITRATE=4000 X265_LOWRES_SCALE=4
> encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 9.343 dB)
> - BITRATE=4000 X265_LOWRES_SCALE=8
> encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 9.086 dB)
> 
> - BITRATE=9000 X265_LOWRES_SCALE=2
> encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 (11.384 dB)
> - BITRATE=9000 X265_LOWRES_SCALE=4
> encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 (11.024 dB)
> - BITRATE=9000 X265_LOWRES_SCALE=8
> encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 (10.761 dB)

Those are pretty significant drops in SSIM; something is likely
broken.

> diff --git a/source/common/common.h b/source/common/common.h
> --- a/source/common/common.h
> +++ b/source/common/common.h
> @@ -153,16 +153,17 @@ typedef int32_t  coeff_t;      // transf
>  // arbitrary, but low because SATD scores are 1/4 normal
>  #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
>  #define X265_LOOKAHEAD_MAX 250
>  // Use the same size blocks as x264.  Using larger blocks seems to give artificially
>  // high cost estimates (intra and inter both suffer)
>  #define X265_LOWRES_CU_SIZE   8
>  #define X265_LOWRES_CU_BITS   3
> +#define X265_LOWRES_SCALE     2
>  #define X265_MALLOC(type, count)    (type*)x265_malloc(sizeof(type) * (count))
>  #define X265_FREE(ptr)              x265_free(ptr)
>  #define CHECKED_MALLOC(var, type, count) \
>      { \
>          var = (type*)x265_malloc(sizeof(type) * (count)); \
>          if (!var) \
>          { \
> diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
> --- a/source/common/lowres.cpp
> +++ b/source/common/lowres.cpp
> @@ -24,20 +24,21 @@
>  #include "TLibCommon/TComPicYuv.h"
>  #include "lowres.h"
>  #include "mv.h"
>  using namespace x265;
>  bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)
>  {
> +	X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE <= 64, "Invalid LowRes scaling\n");
>      isLowres = true;
>      bframes = _bframes;
> -    width = orig->getWidth() / 2;
> -    lines = orig->getHeight() / 2;
> +    width = orig->getWidth() / X265_LOWRES_SCALE;
> +    lines = orig->getHeight() / X265_LOWRES_SCALE;
>      lumaStride = width + 2 * orig->getLumaMarginX();
>      if (lumaStride & 31)
>          lumaStride += 32 - (lumaStride & 31);
>      int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      int cuCount = cuWidth * cuHeight;
>      /* rounding the width to multiple of lowres CU size */
> diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
> --- a/source/encoder/frameencoder.cpp
> +++ b/source/encoder/frameencoder.cpp
> @@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c
>      if (bIsVbv)
>      {
>          m_frame->m_cuCostsForVbv[cuAddr] = 0;
>          m_frame->m_intraCuCostsForVbv[cuAddr] = 0;
>      }
>      /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */
>      double qp_offset = 0;
> -    int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (16 - 1)) / 16;
> -    int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (16 - 1)) / 16;
> -    int noOfBlocks = g_maxCUSize / 16;
> +    int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
> +    int maxBlockCols = (m_frame->getPicYuvOrg()->getWidth() + (lowResCu - 1)) / lowResCu;
> +    int maxBlockRows = (m_frame->getPicYuvOrg()->getHeight() + (lowResCu - 1)) / lowResCu;
> +    int noOfBlocks = g_maxCUSize / lowResCu;
>      int block_y = (cuAddr / m_frame->getPicSym()->getFrameWidthInCU()) * noOfBlocks;
>      int block_x = (cuAddr * noOfBlocks) - block_y * m_frame->getPicSym()->getFrameWidthInCU();
>      /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */
>      double *qpoffs = (m_isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset;
>      int cnt = 0, idx = 0;
>      for (int h = 0; h < noOfBlocks && block_y < maxBlockRows; h++, block_y++)
> diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
> --- a/source/encoder/ratecontrol.cpp
> +++ b/source/encoder/ratecontrol.cpp
> @@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame
>      /* Calculate Qp offset for each 16x16 block in the frame */
>      int block_xy = 0;
>      int block_x = 0, block_y = 0;
>      double strength = 0.f;
>      if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0)
>      {
>          /* Need to init it anyways for CU tree */
> -        int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> -        int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +        int cuWidth = ((maxCol / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +        int cuHeight = ((maxRow / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>          int cuCount = cuWidth * cuHeight;
>          if (m_param->rc.aqMode && m_param->rc.aqStrength == 0)
>          {
>              memset(pic->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double));
>              memset(pic->m_lowres.qpAqOffset, 0, cuCount * sizeof(double));
>              for (int cuxy = 0; cuxy < cuCount; cuxy++)
>              {
> @@ -231,19 +231,19 @@ void RateControl::calcAdaptiveQuantFrame
>      }
>      else
>      {
>          block_xy = 0;
>          double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0;
>          if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
>          {
>              double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5);
> -            for (block_y = 0; block_y < maxRow; block_y += 16)
> +            for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
>              {
> -                for (block_x = 0; block_x < maxCol; block_x += 16)
> +                for (block_x = 0; block_x < maxCol; block_x += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
>                  {
>                      uint32_t energy = acEnergyCu(pic, block_x, block_y);
>                      qp_adj = pow(energy + 1, 0.1);
>                      pic->m_lowres.qpCuTreeOffset[block_xy] = qp_adj;
>                      avg_adj += qp_adj;
>                      avg_adj_pow2 += qp_adj * qp_adj;
>                      block_xy++;
>                  }
> @@ -253,19 +253,19 @@ void RateControl::calcAdaptiveQuantFrame
>              avg_adj_pow2 /= m_ncu;
>              strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction;
>              avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj;
>          }
>          else
>              strength = m_param->rc.aqStrength * 1.0397f;
>          block_xy = 0;
> -        for (block_y = 0; block_y < maxRow; block_y += 16)
> +        for (block_y = 0; block_y < maxRow; block_y += (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
>          {
> -            for (block_x = 0; block_x < maxCol; block_x += 16)
> +            for (block_x = 0; block_x < maxCol; block_x +=  (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE))
>              {
>                  if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE)
>                  {
>                      qp_adj = pic->m_lowres.qpCuTreeOffset[block_xy];
>                      qp_adj = strength * (qp_adj - avg_adj);
>                  }
>                  else
>                  {
> @@ -297,18 +297,18 @@ void RateControl::calcAdaptiveQuantFrame
>              pic->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]);
>          }
>      }
>  }
>  RateControl::RateControl(x265_param *p)
>  {
>      m_param = p;
> -    int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> -    int lowresCuHeight = ((m_param->sourceHeight / 2)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    int lowresCuWidth = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    int lowresCuHeight = ((m_param->sourceHeight / X265_LOWRES_SCALE)  + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      m_ncu = lowresCuWidth * lowresCuHeight;
>      if (m_param->rc.cuTree)
>          m_qCompress = 1;
>      else
>          m_qCompress = m_param->rc.qCompress;
>      // validate for param->rc, maybe it is need to add a function like x265_parameters_valiate()
> diff --git a/source/encoder/slicetype.cpp b/source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp
> +++ b/source/encoder/slicetype.cpp
> @@ -61,18 +61,18 @@ Lookahead::Lookahead(x265_param *param,
>  {
>      m_bReady = 0;
>      m_param = param;
>      m_top = enc;
>      m_lastKeyframe = -m_param->keyframeMax;
>      m_lastNonB = NULL;
>      m_bFilling = true;
>      m_bFlushed = false;
> -    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> -    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int));
>      memset(m_histogram, 0, sizeof(m_histogram));
>  }
>  Lookahead::~Lookahead() { }
>  void Lookahead::init()
>  {
> @@ -1205,18 +1205,18 @@ CostEstimate::~CostEstimate()
>      }
>      delete[] m_rows;
>  }
>  void CostEstimate::init(x265_param *_param, Frame *pic)
>  {
>      m_param = _param;
> -    m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> -    m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    m_widthInCU = ((m_param->sourceWidth / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
> +    m_heightInCU = ((m_param->sourceHeight / X265_LOWRES_SCALE) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS;
>      m_rows = new EstimateRow[m_heightInCU];
>      for (int i = 0; i < m_heightInCU; i++)
>      {
>          m_rows[i].m_widthInCU = m_widthInCU;
>          m_rows[i].m_heightInCU = m_heightInCU;
>      }
> diff --git a/source/encoder/weightPrediction.cpp b/source/encoder/weightPrediction.cpp
> --- a/source/encoder/weightPrediction.cpp
> +++ b/source/encoder/weightPrediction.cpp
> @@ -52,39 +52,40 @@ int sliceHeaderCost(WeightParam *w, int
>      return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
>  }
>  /* make a motion compensated copy of lowres ref into mcout with the same stride.
>   * The borders of mcout are not extended */
>  void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
>  {
>      int stride = ref.lumaStride;
> -    const int cuSize = 8;
> +    const int cuSize = X265_LOWRES_CU_SIZE;
> +    const int partSize = partitionFromSizes(cuSize, cuSize);
>      MV mvmin, mvmax;
>      int cu = 0;
>      for (int y = 0; y < ref.lines; y += cuSize)
>      {
>          int pixoff = y * stride;
>          mvmin.y = (int16_t)((-y - 8) << 2);
>          mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2);
>          for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
>          {
> -            ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
> -            intptr_t bstride = 8;
> +            ALIGN_VAR_16(pixel, buf[cuSize * cuSize]);
> +            intptr_t bstride = cuSize;
>              mvmin.x = (int16_t)((-x - 8) << 2);
>              mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2);
>              /* clip MV to available pixels */
>              MV mv = mvs[cu];
>              mv = mv.clipped(mvmin, mvmax);
> -            pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
> -            primitives.luma_copy_pp[LUMA_8x8](mcout + pixoff, stride, tmp, bstride);
> +            pixel *tmp = ref.lowresMC(pixoff, mv, buf, bstride);
> +            primitives.luma_copy_pp[partSize](mcout + pixoff, stride, tmp, bstride);
>          }
>      }
>      x265_emms();
>  }
>  /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
>   * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
> @@ -94,71 +95,73 @@ void mcChroma(pixel *      mcout,
>                const MV *   mvs,
>                const Cache& cache,
>                int          height,
>                int          width)
>  {
>      /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
>       * luma blocks. We have to adapt block size to chroma csp */
>      int csp = cache.csp;
> -    int bw = 16 >> cache.hshift;
> -    int bh = 16 >> cache.vshift;
> +    int cuSize = X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE;
> +    const int partSize = partitionFromSizes(cuSize, cuSize);
> +    int bw = cuSize >> cache.hshift;
> +    int bh = cuSize >> cache.vshift;
>      MV mvmin, mvmax;
>      for (int y = 0; y < height; y += bh)
>      {
>          /* note: lowres block count per row might be different from chroma block
>           * count per row because of rounding issues, so be very careful with indexing
>           * into the lowres structures */
>          int cu = y * cache.lowresWidthInCU;
>          int pixoff = y * stride;
>          mvmin.y = (int16_t)((-y - 8) << 2);
>          mvmax.y = (int16_t)((height - y - 1 + 8) << 2);
>          for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
>          {
>              if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
>              {
> -                MV mv = mvs[cu]; // lowres MV
> -                mv <<= 1;        // fullres MV
> +                MV mv = mvs[cu];         // lowres MV
> +                mv *= X265_LOWRES_SCALE; // fullres MV
>                  mv.x >>= cache.hshift;
>                  mv.y >>= cache.vshift;
>                  /* clip MV to available pixels */
>                  mvmin.x = (int16_t)((-x - 8) << 2);
>                  mvmax.x = (int16_t)((width - x - 1 + 8) << 2);
>                  mv = mv.clipped(mvmin, mvmax);
>                  int fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
>                  pixel *temp = src + pixoff + fpeloffset;
>                  int xFrac = mv.x & 0x7;
>                  int yFrac = mv.y & 0x7;
>                  if ((yFrac | xFrac) == 0)
>                  {
> -                    primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, temp, stride);
> +                    primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, temp, stride);
>                  }
>                  else if (yFrac == 0)
>                  {
> -                    primitives.chroma[csp].filter_hpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, xFrac);
> +                    primitives.chroma[csp].filter_hpp[partSize](temp, stride, mcout + pixoff, stride, xFrac);
>                  }
>                  else if (xFrac == 0)
>                  {
> -                    primitives.chroma[csp].filter_vpp[LUMA_16x16](temp, stride, mcout + pixoff, stride, yFrac);
> +                    primitives.chroma[csp].filter_vpp[partSize](temp, stride, mcout + pixoff, stride, yFrac);
>                  }
>                  else
>                  {
> -                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
> -                    primitives.chroma[csp].filter_hps[LUMA_16x16](temp, stride, imm, bw, xFrac, 1);
> -                    primitives.chroma[csp].filter_vsp[LUMA_16x16](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
> +                    ALIGN_VAR_16(int16_t, imm[cuSize * (cuSize + NTAPS_CHROMA)]);
> +                    primitives.chroma[csp].filter_hps[partSize](temp, stride, imm, bw, xFrac, 1);
> +                    primitives.chroma[csp].filter_vsp[partSize](imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
>                  }
>              }
>              else
>              {
> -                primitives.chroma[csp].copy_pp[LUMA_16x16](mcout + pixoff, stride, src + pixoff, stride);
> +                primitives.chroma[csp].copy_pp[partSize](mcout + pixoff, stride, src + pixoff, stride);
>              }
>          }
>      }
>      x265_emms();
>  }
>  /* Measure sum of 8x8 satd costs between source frame and reference
> @@ -191,21 +194,24 @@ uint32_t weightCost(pixel *         fenc
>      }
>      uint32_t cost = 0;
>      pixel *f = fenc, *r = ref;
>      if (bLuma)
>      {
>          int cu = 0;
> -        for (int y = 8; y < height; y += 8, r += 8 * stride, f += 8 * stride)
> +        int cuSize = X265_LOWRES_CU_SIZE;

I think you're missing a scale here

> +        int partSize = partitionFromSizes(cuSize, cuSize);
> +
> +        for (int y = cuSize; y < height; y += cuSize, r += cuSize * stride, f += cuSize * stride)
>          {
> -            for (int x = 8; x < width; x += 8, cu++)
> +            for (int x = cuSize; x < width; x += cuSize, cu++)
>              {
> -                int cmp = primitives.satd[LUMA_8x8](r + x, stride, f + x, stride);
> +                int cmp = primitives.satd[partSize](r + x, stride, f + x, stride);
>                  cost += X265_MIN(cmp, cache.intraCost[cu]);
>              }
>          }
>      }
>      else if (cache.csp == X265_CSP_I444)
>          for (int y = 16; y < height; y += 16, r += 16 * stride, f += 16 * stride)
>          {
>              for (int x = 16; x < width; x += 16)
> @@ -235,18 +241,18 @@ void weightAnalyse(Slice& slice, x265_pa
>      TComPicYuv *fencYuv = slice.m_pic->getPicYuvOrg();
>      Lowres& fenc        = slice.m_pic->m_lowres;
>      Cache cache;
>      memset(&cache, 0, sizeof(cache));
>      cache.intraCost = fenc.intraCost;
>      cache.numPredDir = slice.isInterP() ? 1 : 2;
> -    cache.lowresWidthInCU = fenc.width >> 3;
> -    cache.lowresHeightInCU = fenc.lines >> 3;
> +    cache.lowresWidthInCU = fenc.width >> X265_LOWRES_CU_BITS;
> +    cache.lowresHeightInCU = fenc.lines >> X265_LOWRES_CU_BITS;

I also didn't see any place in the patch that actually performs the multiple downscales.

-- 
Steve Borho